{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7920341394025604, "eval_steps": 348, "global_step": 1392, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005689900426742532, "grad_norm": 0.922553300857544, "learning_rate": 2e-05, "loss": 1.7225, "step": 1 }, { "epoch": 0.0005689900426742532, "eval_loss": 1.6560131311416626, "eval_runtime": 17.2854, "eval_samples_per_second": 42.811, "eval_steps_per_second": 21.405, "step": 1 }, { "epoch": 0.0011379800853485065, "grad_norm": 1.0872293710708618, "learning_rate": 4e-05, "loss": 1.7777, "step": 2 }, { "epoch": 0.0017069701280227596, "grad_norm": 1.0032234191894531, "learning_rate": 6e-05, "loss": 1.6594, "step": 3 }, { "epoch": 0.002275960170697013, "grad_norm": 0.9296952486038208, "learning_rate": 8e-05, "loss": 1.6329, "step": 4 }, { "epoch": 0.002844950213371266, "grad_norm": 0.8549262881278992, "learning_rate": 0.0001, "loss": 1.6946, "step": 5 }, { "epoch": 0.0034139402560455193, "grad_norm": 0.7175059914588928, "learning_rate": 0.00012, "loss": 1.605, "step": 6 }, { "epoch": 0.003982930298719772, "grad_norm": 0.729087233543396, "learning_rate": 0.00014, "loss": 1.7539, "step": 7 }, { "epoch": 0.004551920341394026, "grad_norm": 0.7559539675712585, "learning_rate": 0.00016, "loss": 1.7079, "step": 8 }, { "epoch": 0.005120910384068279, "grad_norm": 0.9097371101379395, "learning_rate": 0.00018, "loss": 1.4693, "step": 9 }, { "epoch": 0.005689900426742532, "grad_norm": 0.7562863230705261, "learning_rate": 0.0002, "loss": 1.7192, "step": 10 }, { "epoch": 0.006258890469416785, "grad_norm": 0.8033550381660461, "learning_rate": 0.00019999974162322295, "loss": 1.6699, "step": 11 }, { "epoch": 0.0068278805120910386, "grad_norm": 0.6270872950553894, "learning_rate": 0.00019999896649422697, "loss": 1.7042, "step": 12 }, { "epoch": 0.007396870554765292, "grad_norm": 0.6003552079200745, "learning_rate": 0.00019999767461701748, "loss": 1.672, "step": 13 }, { "epoch": 0.007965860597439544, "grad_norm": 0.5751997232437134, "learning_rate": 0.00019999586599827042, "loss": 1.5727, "step": 14 }, { "epoch": 0.008534850640113799, "grad_norm": 0.5488961338996887, "learning_rate": 0.00019999354064733184, "loss": 1.6477, "step": 15 }, { "epoch": 0.009103840682788052, "grad_norm": 0.4690549671649933, "learning_rate": 0.00019999069857621807, "loss": 1.4063, "step": 16 }, { "epoch": 0.009672830725462305, "grad_norm": 0.5245763659477234, "learning_rate": 0.00019998733979961563, "loss": 1.649, "step": 17 }, { "epoch": 0.010241820768136558, "grad_norm": 0.4962601661682129, "learning_rate": 0.0001999834643348811, "loss": 1.5558, "step": 18 }, { "epoch": 0.010810810810810811, "grad_norm": 0.5009298324584961, "learning_rate": 0.0001999790722020411, "loss": 1.6178, "step": 19 }, { "epoch": 0.011379800853485065, "grad_norm": 0.5524196028709412, "learning_rate": 0.00019997416342379208, "loss": 1.6133, "step": 20 }, { "epoch": 0.011948790896159318, "grad_norm": 0.48095259070396423, "learning_rate": 0.00019996873802550043, "loss": 1.4158, "step": 21 }, { "epoch": 0.01251778093883357, "grad_norm": 0.5575169324874878, "learning_rate": 0.00019996279603520196, "loss": 1.7057, "step": 22 }, { "epoch": 0.013086770981507824, "grad_norm": 0.5423071384429932, "learning_rate": 0.00019995633748360223, "loss": 1.5661, "step": 23 }, { "epoch": 0.013655761024182077, "grad_norm": 0.49561819434165955, "learning_rate": 0.00019994936240407598, "loss": 1.4119, "step": 24 }, { "epoch": 0.01422475106685633, "grad_norm": 0.4862682521343231, "learning_rate": 0.00019994187083266716, "loss": 1.519, "step": 25 }, { "epoch": 0.014793741109530583, "grad_norm": 0.5174720883369446, "learning_rate": 0.0001999338628080888, "loss": 1.3668, "step": 26 }, { "epoch": 0.015362731152204837, "grad_norm": 0.5306721329689026, "learning_rate": 0.0001999253383717226, "loss": 1.6097, "step": 27 }, { "epoch": 0.015931721194879088, "grad_norm": 0.5307742357254028, "learning_rate": 0.00019991629756761886, "loss": 1.7738, "step": 28 }, { "epoch": 0.016500711237553343, "grad_norm": 0.6086705327033997, "learning_rate": 0.00019990674044249634, "loss": 1.7079, "step": 29 }, { "epoch": 0.017069701280227598, "grad_norm": 0.5047173500061035, "learning_rate": 0.00019989666704574175, "loss": 1.6998, "step": 30 }, { "epoch": 0.01763869132290185, "grad_norm": 0.5041013360023499, "learning_rate": 0.00019988607742940978, "loss": 1.7047, "step": 31 }, { "epoch": 0.018207681365576104, "grad_norm": 0.4694116413593292, "learning_rate": 0.00019987497164822263, "loss": 1.3058, "step": 32 }, { "epoch": 0.018776671408250355, "grad_norm": 0.5069786310195923, "learning_rate": 0.0001998633497595698, "loss": 1.6603, "step": 33 }, { "epoch": 0.01934566145092461, "grad_norm": 0.4877070486545563, "learning_rate": 0.0001998512118235078, "loss": 1.5145, "step": 34 }, { "epoch": 0.01991465149359886, "grad_norm": 0.5028818845748901, "learning_rate": 0.0001998385579027599, "loss": 1.5016, "step": 35 }, { "epoch": 0.020483641536273117, "grad_norm": 0.4918319880962372, "learning_rate": 0.00019982538806271566, "loss": 1.5468, "step": 36 }, { "epoch": 0.021052631578947368, "grad_norm": 0.5177620649337769, "learning_rate": 0.00019981170237143067, "loss": 1.5555, "step": 37 }, { "epoch": 0.021621621621621623, "grad_norm": 0.49115803837776184, "learning_rate": 0.00019979750089962629, "loss": 1.592, "step": 38 }, { "epoch": 0.022190611664295874, "grad_norm": 0.5621944069862366, "learning_rate": 0.00019978278372068906, "loss": 1.6697, "step": 39 }, { "epoch": 0.02275960170697013, "grad_norm": 0.49260076880455017, "learning_rate": 0.00019976755091067054, "loss": 1.4688, "step": 40 }, { "epoch": 0.02332859174964438, "grad_norm": 0.4910222589969635, "learning_rate": 0.00019975180254828688, "loss": 1.462, "step": 41 }, { "epoch": 0.023897581792318635, "grad_norm": 0.5017576217651367, "learning_rate": 0.0001997355387149182, "loss": 1.6558, "step": 42 }, { "epoch": 0.024466571834992887, "grad_norm": 0.5089415907859802, "learning_rate": 0.00019971875949460852, "loss": 1.6412, "step": 43 }, { "epoch": 0.02503556187766714, "grad_norm": 0.4794662594795227, "learning_rate": 0.00019970146497406505, "loss": 1.6011, "step": 44 }, { "epoch": 0.025604551920341393, "grad_norm": 0.5046934485435486, "learning_rate": 0.00019968365524265777, "loss": 1.6675, "step": 45 }, { "epoch": 0.026173541963015648, "grad_norm": 0.4993690550327301, "learning_rate": 0.0001996653303924192, "loss": 1.6735, "step": 46 }, { "epoch": 0.0267425320056899, "grad_norm": 0.48856502771377563, "learning_rate": 0.00019964649051804355, "loss": 1.5536, "step": 47 }, { "epoch": 0.027311522048364154, "grad_norm": 0.4920005202293396, "learning_rate": 0.0001996271357168866, "loss": 1.6204, "step": 48 }, { "epoch": 0.027880512091038406, "grad_norm": 0.5342410802841187, "learning_rate": 0.00019960726608896502, "loss": 1.719, "step": 49 }, { "epoch": 0.02844950213371266, "grad_norm": 0.5041580200195312, "learning_rate": 0.00019958688173695572, "loss": 1.7053, "step": 50 }, { "epoch": 0.029018492176386912, "grad_norm": 0.5237680077552795, "learning_rate": 0.00019956598276619562, "loss": 1.5091, "step": 51 }, { "epoch": 0.029587482219061167, "grad_norm": 0.4911646246910095, "learning_rate": 0.0001995445692846809, "loss": 1.6085, "step": 52 }, { "epoch": 0.030156472261735418, "grad_norm": 0.520005464553833, "learning_rate": 0.00019952264140306645, "loss": 1.4782, "step": 53 }, { "epoch": 0.030725462304409673, "grad_norm": 0.49788954854011536, "learning_rate": 0.0001995001992346654, "loss": 1.4905, "step": 54 }, { "epoch": 0.031294452347083924, "grad_norm": 0.5043379664421082, "learning_rate": 0.00019947724289544845, "loss": 1.6566, "step": 55 }, { "epoch": 0.031863442389758176, "grad_norm": 0.5547715425491333, "learning_rate": 0.00019945377250404328, "loss": 1.7227, "step": 56 }, { "epoch": 0.032432432432432434, "grad_norm": 0.5288915634155273, "learning_rate": 0.000199429788181734, "loss": 1.5921, "step": 57 }, { "epoch": 0.033001422475106686, "grad_norm": 0.5353677868843079, "learning_rate": 0.00019940529005246048, "loss": 1.5371, "step": 58 }, { "epoch": 0.03357041251778094, "grad_norm": 0.520143449306488, "learning_rate": 0.00019938027824281757, "loss": 1.6308, "step": 59 }, { "epoch": 0.034139402560455195, "grad_norm": 0.50368732213974, "learning_rate": 0.0001993547528820548, "loss": 1.4645, "step": 60 }, { "epoch": 0.03470839260312945, "grad_norm": 0.5326752066612244, "learning_rate": 0.0001993287141020753, "loss": 1.5832, "step": 61 }, { "epoch": 0.0352773826458037, "grad_norm": 0.48568812012672424, "learning_rate": 0.00019930216203743544, "loss": 1.4137, "step": 62 }, { "epoch": 0.03584637268847795, "grad_norm": 0.4832801818847656, "learning_rate": 0.0001992750968253439, "loss": 1.4713, "step": 63 }, { "epoch": 0.03641536273115221, "grad_norm": 0.49059394001960754, "learning_rate": 0.00019924751860566118, "loss": 1.6009, "step": 64 }, { "epoch": 0.03698435277382646, "grad_norm": 0.5292865633964539, "learning_rate": 0.0001992194275208987, "loss": 1.6339, "step": 65 }, { "epoch": 0.03755334281650071, "grad_norm": 0.520621120929718, "learning_rate": 0.00019919082371621811, "loss": 1.7033, "step": 66 }, { "epoch": 0.03812233285917496, "grad_norm": 0.5552493929862976, "learning_rate": 0.0001991617073394306, "loss": 1.5704, "step": 67 }, { "epoch": 0.03869132290184922, "grad_norm": 0.5199451446533203, "learning_rate": 0.0001991320785409961, "loss": 1.6266, "step": 68 }, { "epoch": 0.03926031294452347, "grad_norm": 0.540593147277832, "learning_rate": 0.0001991019374740225, "loss": 1.7327, "step": 69 }, { "epoch": 0.03982930298719772, "grad_norm": 0.5305120348930359, "learning_rate": 0.00019907128429426477, "loss": 1.6544, "step": 70 }, { "epoch": 0.040398293029871975, "grad_norm": 0.5247764587402344, "learning_rate": 0.00019904011916012433, "loss": 1.429, "step": 71 }, { "epoch": 0.04096728307254623, "grad_norm": 0.500156819820404, "learning_rate": 0.00019900844223264813, "loss": 1.6106, "step": 72 }, { "epoch": 0.041536273115220484, "grad_norm": 0.49794986844062805, "learning_rate": 0.00019897625367552784, "loss": 1.5322, "step": 73 }, { "epoch": 0.042105263157894736, "grad_norm": 0.5475789308547974, "learning_rate": 0.00019894355365509894, "loss": 1.4882, "step": 74 }, { "epoch": 0.04267425320056899, "grad_norm": 0.5272343158721924, "learning_rate": 0.00019891034234033995, "loss": 1.5119, "step": 75 }, { "epoch": 0.043243243243243246, "grad_norm": 0.4892237186431885, "learning_rate": 0.00019887661990287153, "loss": 1.5567, "step": 76 }, { "epoch": 0.0438122332859175, "grad_norm": 0.528414249420166, "learning_rate": 0.00019884238651695556, "loss": 1.7716, "step": 77 }, { "epoch": 0.04438122332859175, "grad_norm": 0.5159140229225159, "learning_rate": 0.00019880764235949427, "loss": 1.6873, "step": 78 }, { "epoch": 0.044950213371266, "grad_norm": 0.5157197713851929, "learning_rate": 0.0001987723876100294, "loss": 1.5196, "step": 79 }, { "epoch": 0.04551920341394026, "grad_norm": 0.518205463886261, "learning_rate": 0.00019873662245074102, "loss": 1.5238, "step": 80 }, { "epoch": 0.04608819345661451, "grad_norm": 0.5316376090049744, "learning_rate": 0.00019870034706644693, "loss": 1.4913, "step": 81 }, { "epoch": 0.04665718349928876, "grad_norm": 0.5020834803581238, "learning_rate": 0.00019866356164460145, "loss": 1.4051, "step": 82 }, { "epoch": 0.04722617354196301, "grad_norm": 0.4912559986114502, "learning_rate": 0.00019862626637529455, "loss": 1.4947, "step": 83 }, { "epoch": 0.04779516358463727, "grad_norm": 0.5261936187744141, "learning_rate": 0.00019858846145125086, "loss": 1.659, "step": 84 }, { "epoch": 0.04836415362731152, "grad_norm": 0.5002409815788269, "learning_rate": 0.00019855014706782867, "loss": 1.4743, "step": 85 }, { "epoch": 0.048933143669985774, "grad_norm": 0.5293824672698975, "learning_rate": 0.0001985113234230189, "loss": 1.5796, "step": 86 }, { "epoch": 0.049502133712660025, "grad_norm": 0.49084582924842834, "learning_rate": 0.00019847199071744415, "loss": 1.6052, "step": 87 }, { "epoch": 0.05007112375533428, "grad_norm": 0.5251219868659973, "learning_rate": 0.00019843214915435758, "loss": 1.7684, "step": 88 }, { "epoch": 0.050640113798008535, "grad_norm": 0.5003427267074585, "learning_rate": 0.0001983917989396418, "loss": 1.5715, "step": 89 }, { "epoch": 0.051209103840682786, "grad_norm": 0.5283729434013367, "learning_rate": 0.0001983509402818081, "loss": 1.5396, "step": 90 }, { "epoch": 0.051778093883357044, "grad_norm": 0.49652016162872314, "learning_rate": 0.00019830957339199494, "loss": 1.5353, "step": 91 }, { "epoch": 0.052347083926031296, "grad_norm": 0.49297675490379333, "learning_rate": 0.00019826769848396727, "loss": 1.5012, "step": 92 }, { "epoch": 0.05291607396870555, "grad_norm": 0.5100125670433044, "learning_rate": 0.0001982253157741151, "loss": 1.6194, "step": 93 }, { "epoch": 0.0534850640113798, "grad_norm": 0.5218221545219421, "learning_rate": 0.00019818242548145265, "loss": 1.6505, "step": 94 }, { "epoch": 0.05405405405405406, "grad_norm": 0.5490546226501465, "learning_rate": 0.000198139027827617, "loss": 1.498, "step": 95 }, { "epoch": 0.05462304409672831, "grad_norm": 0.5228062868118286, "learning_rate": 0.00019809512303686706, "loss": 1.4592, "step": 96 }, { "epoch": 0.05519203413940256, "grad_norm": 0.49827295541763306, "learning_rate": 0.00019805071133608242, "loss": 1.6593, "step": 97 }, { "epoch": 0.05576102418207681, "grad_norm": 0.5081865191459656, "learning_rate": 0.0001980057929547621, "loss": 1.4226, "step": 98 }, { "epoch": 0.05633001422475107, "grad_norm": 0.5018671751022339, "learning_rate": 0.00019796036812502347, "loss": 1.4995, "step": 99 }, { "epoch": 0.05689900426742532, "grad_norm": 0.5807016491889954, "learning_rate": 0.00019791443708160094, "loss": 1.7405, "step": 100 }, { "epoch": 0.05746799431009957, "grad_norm": 0.5095066428184509, "learning_rate": 0.00019786800006184473, "loss": 1.4908, "step": 101 }, { "epoch": 0.058036984352773824, "grad_norm": 0.5552268028259277, "learning_rate": 0.00019782105730571992, "loss": 1.5289, "step": 102 }, { "epoch": 0.05860597439544808, "grad_norm": 0.47026970982551575, "learning_rate": 0.00019777360905580478, "loss": 1.3497, "step": 103 }, { "epoch": 0.059174964438122334, "grad_norm": 0.5475593209266663, "learning_rate": 0.00019772565555728984, "loss": 1.6329, "step": 104 }, { "epoch": 0.059743954480796585, "grad_norm": 0.5217400789260864, "learning_rate": 0.00019767719705797657, "loss": 1.6181, "step": 105 }, { "epoch": 0.060312944523470836, "grad_norm": 0.5143265128135681, "learning_rate": 0.00019762823380827592, "loss": 1.6369, "step": 106 }, { "epoch": 0.060881934566145095, "grad_norm": 0.501568615436554, "learning_rate": 0.0001975787660612072, "loss": 1.6871, "step": 107 }, { "epoch": 0.061450924608819346, "grad_norm": 0.47950610518455505, "learning_rate": 0.00019752879407239685, "loss": 1.4494, "step": 108 }, { "epoch": 0.0620199146514936, "grad_norm": 0.5488466024398804, "learning_rate": 0.0001974783181000768, "loss": 1.6457, "step": 109 }, { "epoch": 0.06258890469416785, "grad_norm": 0.5165080428123474, "learning_rate": 0.0001974273384050835, "loss": 1.5463, "step": 110 }, { "epoch": 0.06315789473684211, "grad_norm": 0.5002058744430542, "learning_rate": 0.0001973758552508563, "loss": 1.4333, "step": 111 }, { "epoch": 0.06372688477951635, "grad_norm": 0.4927598237991333, "learning_rate": 0.00019732386890343624, "loss": 1.5576, "step": 112 }, { "epoch": 0.06429587482219061, "grad_norm": 0.5156055688858032, "learning_rate": 0.0001972713796314646, "loss": 1.4821, "step": 113 }, { "epoch": 0.06486486486486487, "grad_norm": 0.5108924508094788, "learning_rate": 0.0001972183877061816, "loss": 1.502, "step": 114 }, { "epoch": 0.06543385490753911, "grad_norm": 0.5052126049995422, "learning_rate": 0.00019716489340142483, "loss": 1.7285, "step": 115 }, { "epoch": 0.06600284495021337, "grad_norm": 0.5034211874008179, "learning_rate": 0.00019711089699362807, "loss": 1.4148, "step": 116 }, { "epoch": 0.06657183499288763, "grad_norm": 0.5284733772277832, "learning_rate": 0.00019705639876181969, "loss": 1.5979, "step": 117 }, { "epoch": 0.06714082503556187, "grad_norm": 0.5434923768043518, "learning_rate": 0.0001970013989876212, "loss": 1.6856, "step": 118 }, { "epoch": 0.06770981507823613, "grad_norm": 0.48895972967147827, "learning_rate": 0.00019694589795524588, "loss": 1.5305, "step": 119 }, { "epoch": 0.06827880512091039, "grad_norm": 0.5481955409049988, "learning_rate": 0.00019688989595149732, "loss": 1.473, "step": 120 }, { "epoch": 0.06884779516358464, "grad_norm": 0.47966116666793823, "learning_rate": 0.00019683339326576781, "loss": 1.1899, "step": 121 }, { "epoch": 0.0694167852062589, "grad_norm": 0.5007337927818298, "learning_rate": 0.00019677639019003706, "loss": 1.4747, "step": 122 }, { "epoch": 0.06998577524893314, "grad_norm": 0.5798030495643616, "learning_rate": 0.00019671888701887046, "loss": 1.5881, "step": 123 }, { "epoch": 0.0705547652916074, "grad_norm": 0.5382363200187683, "learning_rate": 0.0001966608840494177, "loss": 1.6345, "step": 124 }, { "epoch": 0.07112375533428165, "grad_norm": 0.5181685090065002, "learning_rate": 0.00019660238158141112, "loss": 1.48, "step": 125 }, { "epoch": 0.0716927453769559, "grad_norm": 0.5349889993667603, "learning_rate": 0.0001965433799171644, "loss": 1.5679, "step": 126 }, { "epoch": 0.07226173541963016, "grad_norm": 0.496991902589798, "learning_rate": 0.00019648387936157068, "loss": 1.5596, "step": 127 }, { "epoch": 0.07283072546230442, "grad_norm": 0.5177836418151855, "learning_rate": 0.0001964238802221012, "loss": 1.3765, "step": 128 }, { "epoch": 0.07339971550497866, "grad_norm": 0.5253962874412537, "learning_rate": 0.00019636338280880366, "loss": 1.7268, "step": 129 }, { "epoch": 0.07396870554765292, "grad_norm": 0.5878409743309021, "learning_rate": 0.00019630238743430058, "loss": 1.5933, "step": 130 }, { "epoch": 0.07453769559032716, "grad_norm": 0.5072840452194214, "learning_rate": 0.00019624089441378775, "loss": 1.3819, "step": 131 }, { "epoch": 0.07510668563300142, "grad_norm": 0.5567812323570251, "learning_rate": 0.0001961789040650325, "loss": 1.5582, "step": 132 }, { "epoch": 0.07567567567567568, "grad_norm": 0.48109254240989685, "learning_rate": 0.00019611641670837219, "loss": 1.4227, "step": 133 }, { "epoch": 0.07624466571834992, "grad_norm": 0.5404167175292969, "learning_rate": 0.00019605343266671245, "loss": 1.6807, "step": 134 }, { "epoch": 0.07681365576102418, "grad_norm": 0.47476792335510254, "learning_rate": 0.00019598995226552556, "loss": 1.3462, "step": 135 }, { "epoch": 0.07738264580369844, "grad_norm": 0.4884220361709595, "learning_rate": 0.0001959259758328487, "loss": 1.5956, "step": 136 }, { "epoch": 0.07795163584637269, "grad_norm": 0.5190904140472412, "learning_rate": 0.00019586150369928245, "loss": 1.6685, "step": 137 }, { "epoch": 0.07852062588904694, "grad_norm": 0.513028621673584, "learning_rate": 0.0001957965361979888, "loss": 1.7023, "step": 138 }, { "epoch": 0.07908961593172119, "grad_norm": 0.4926295578479767, "learning_rate": 0.00019573107366468962, "loss": 1.4606, "step": 139 }, { "epoch": 0.07965860597439545, "grad_norm": 0.5009914636611938, "learning_rate": 0.00019566511643766485, "loss": 1.5636, "step": 140 }, { "epoch": 0.0802275960170697, "grad_norm": 0.54355388879776, "learning_rate": 0.00019559866485775084, "loss": 1.681, "step": 141 }, { "epoch": 0.08079658605974395, "grad_norm": 0.5059416890144348, "learning_rate": 0.00019553171926833853, "loss": 1.6193, "step": 142 }, { "epoch": 0.08136557610241821, "grad_norm": 0.5309209227561951, "learning_rate": 0.00019546428001537155, "loss": 1.5552, "step": 143 }, { "epoch": 0.08193456614509247, "grad_norm": 0.4913862943649292, "learning_rate": 0.0001953963474473447, "loss": 1.5506, "step": 144 }, { "epoch": 0.08250355618776671, "grad_norm": 0.5331928133964539, "learning_rate": 0.0001953279219153019, "loss": 1.7152, "step": 145 }, { "epoch": 0.08307254623044097, "grad_norm": 0.5169084072113037, "learning_rate": 0.00019525900377283457, "loss": 1.6177, "step": 146 }, { "epoch": 0.08364153627311523, "grad_norm": 0.5159075856208801, "learning_rate": 0.00019518959337607957, "loss": 1.5652, "step": 147 }, { "epoch": 0.08421052631578947, "grad_norm": 0.5606206655502319, "learning_rate": 0.0001951196910837177, "loss": 1.6821, "step": 148 }, { "epoch": 0.08477951635846373, "grad_norm": 0.47890591621398926, "learning_rate": 0.0001950492972569715, "loss": 1.5041, "step": 149 }, { "epoch": 0.08534850640113797, "grad_norm": 0.5077673196792603, "learning_rate": 0.0001949784122596035, "loss": 1.5837, "step": 150 }, { "epoch": 0.08591749644381223, "grad_norm": 0.5021458268165588, "learning_rate": 0.00019490703645791454, "loss": 1.5813, "step": 151 }, { "epoch": 0.08648648648648649, "grad_norm": 0.5000331997871399, "learning_rate": 0.00019483517022074156, "loss": 1.5686, "step": 152 }, { "epoch": 0.08705547652916074, "grad_norm": 0.5121405124664307, "learning_rate": 0.0001947628139194559, "loss": 1.4329, "step": 153 }, { "epoch": 0.087624466571835, "grad_norm": 0.5058543682098389, "learning_rate": 0.00019468996792796137, "loss": 1.36, "step": 154 }, { "epoch": 0.08819345661450925, "grad_norm": 0.5810546875, "learning_rate": 0.00019461663262269213, "loss": 1.3764, "step": 155 }, { "epoch": 0.0887624466571835, "grad_norm": 0.5015589594841003, "learning_rate": 0.00019454280838261106, "loss": 1.4966, "step": 156 }, { "epoch": 0.08933143669985776, "grad_norm": 0.5284256339073181, "learning_rate": 0.0001944684955892075, "loss": 1.4944, "step": 157 }, { "epoch": 0.089900426742532, "grad_norm": 0.49957889318466187, "learning_rate": 0.0001943936946264955, "loss": 1.4641, "step": 158 }, { "epoch": 0.09046941678520626, "grad_norm": 0.5073912143707275, "learning_rate": 0.00019431840588101157, "loss": 1.3371, "step": 159 }, { "epoch": 0.09103840682788052, "grad_norm": 0.5323196649551392, "learning_rate": 0.00019424262974181313, "loss": 1.5312, "step": 160 }, { "epoch": 0.09160739687055476, "grad_norm": 0.5276457071304321, "learning_rate": 0.00019416636660047595, "loss": 1.64, "step": 161 }, { "epoch": 0.09217638691322902, "grad_norm": 0.49499741196632385, "learning_rate": 0.0001940896168510926, "loss": 1.3689, "step": 162 }, { "epoch": 0.09274537695590328, "grad_norm": 0.5169721245765686, "learning_rate": 0.00019401238089027017, "loss": 1.5352, "step": 163 }, { "epoch": 0.09331436699857752, "grad_norm": 0.48859354853630066, "learning_rate": 0.0001939346591171281, "loss": 1.4584, "step": 164 }, { "epoch": 0.09388335704125178, "grad_norm": 0.5150989890098572, "learning_rate": 0.00019385645193329654, "loss": 1.5178, "step": 165 }, { "epoch": 0.09445234708392602, "grad_norm": 0.48626863956451416, "learning_rate": 0.00019377775974291383, "loss": 1.3689, "step": 166 }, { "epoch": 0.09502133712660028, "grad_norm": 0.5352733731269836, "learning_rate": 0.0001936985829526247, "loss": 1.5953, "step": 167 }, { "epoch": 0.09559032716927454, "grad_norm": 0.5061799883842468, "learning_rate": 0.00019361892197157797, "loss": 1.6339, "step": 168 }, { "epoch": 0.09615931721194879, "grad_norm": 0.5095758438110352, "learning_rate": 0.0001935387772114246, "loss": 1.5116, "step": 169 }, { "epoch": 0.09672830725462304, "grad_norm": 0.4948934316635132, "learning_rate": 0.00019345814908631556, "loss": 1.3963, "step": 170 }, { "epoch": 0.0972972972972973, "grad_norm": 0.5632720589637756, "learning_rate": 0.0001933770380128995, "loss": 1.618, "step": 171 }, { "epoch": 0.09786628733997155, "grad_norm": 0.5013827681541443, "learning_rate": 0.00019329544441032076, "loss": 1.4847, "step": 172 }, { "epoch": 0.0984352773826458, "grad_norm": 0.512117326259613, "learning_rate": 0.0001932133687002172, "loss": 1.4346, "step": 173 }, { "epoch": 0.09900426742532005, "grad_norm": 0.5385090708732605, "learning_rate": 0.00019313081130671798, "loss": 1.6694, "step": 174 }, { "epoch": 0.09957325746799431, "grad_norm": 0.5616840720176697, "learning_rate": 0.00019304777265644133, "loss": 1.5638, "step": 175 }, { "epoch": 0.10014224751066857, "grad_norm": 0.5222409963607788, "learning_rate": 0.0001929642531784925, "loss": 1.6203, "step": 176 }, { "epoch": 0.10071123755334281, "grad_norm": 0.5733211040496826, "learning_rate": 0.00019288025330446126, "loss": 1.6952, "step": 177 }, { "epoch": 0.10128022759601707, "grad_norm": 0.5625792741775513, "learning_rate": 0.00019279577346842, "loss": 1.6639, "step": 178 }, { "epoch": 0.10184921763869133, "grad_norm": 0.5778010487556458, "learning_rate": 0.0001927108141069213, "loss": 1.5719, "step": 179 }, { "epoch": 0.10241820768136557, "grad_norm": 0.5034694671630859, "learning_rate": 0.00019262537565899564, "loss": 1.4461, "step": 180 }, { "epoch": 0.10298719772403983, "grad_norm": 0.5446426272392273, "learning_rate": 0.0001925394585661492, "loss": 1.4904, "step": 181 }, { "epoch": 0.10355618776671409, "grad_norm": 0.47503742575645447, "learning_rate": 0.00019245306327236172, "loss": 1.5012, "step": 182 }, { "epoch": 0.10412517780938833, "grad_norm": 0.5337246656417847, "learning_rate": 0.00019236619022408387, "loss": 1.4175, "step": 183 }, { "epoch": 0.10469416785206259, "grad_norm": 0.5157039165496826, "learning_rate": 0.00019227883987023523, "loss": 1.6435, "step": 184 }, { "epoch": 0.10526315789473684, "grad_norm": 0.5278623700141907, "learning_rate": 0.00019219101266220188, "loss": 1.6746, "step": 185 }, { "epoch": 0.1058321479374111, "grad_norm": 0.4916015565395355, "learning_rate": 0.000192102709053834, "loss": 1.4584, "step": 186 }, { "epoch": 0.10640113798008535, "grad_norm": 0.5512337684631348, "learning_rate": 0.00019201392950144363, "loss": 1.6313, "step": 187 }, { "epoch": 0.1069701280227596, "grad_norm": 0.506673276424408, "learning_rate": 0.0001919246744638023, "loss": 1.4842, "step": 188 }, { "epoch": 0.10753911806543386, "grad_norm": 0.49428772926330566, "learning_rate": 0.00019183494440213857, "loss": 1.4246, "step": 189 }, { "epoch": 0.10810810810810811, "grad_norm": 0.5020580887794495, "learning_rate": 0.0001917447397801357, "loss": 1.6966, "step": 190 }, { "epoch": 0.10867709815078236, "grad_norm": 0.5004864931106567, "learning_rate": 0.00019165406106392928, "loss": 1.3144, "step": 191 }, { "epoch": 0.10924608819345662, "grad_norm": 0.47853466868400574, "learning_rate": 0.00019156290872210488, "loss": 1.3321, "step": 192 }, { "epoch": 0.10981507823613086, "grad_norm": 0.4940144121646881, "learning_rate": 0.00019147128322569533, "loss": 1.2719, "step": 193 }, { "epoch": 0.11038406827880512, "grad_norm": 0.5355538725852966, "learning_rate": 0.00019137918504817878, "loss": 1.4551, "step": 194 }, { "epoch": 0.11095305832147938, "grad_norm": 0.5604861378669739, "learning_rate": 0.00019128661466547576, "loss": 1.6109, "step": 195 }, { "epoch": 0.11152204836415362, "grad_norm": 0.5061023235321045, "learning_rate": 0.000191193572555947, "loss": 1.511, "step": 196 }, { "epoch": 0.11209103840682788, "grad_norm": 0.5125574469566345, "learning_rate": 0.0001911000592003909, "loss": 1.4209, "step": 197 }, { "epoch": 0.11266002844950214, "grad_norm": 0.5150197744369507, "learning_rate": 0.00019100607508204114, "loss": 1.6323, "step": 198 }, { "epoch": 0.11322901849217638, "grad_norm": 0.5164692997932434, "learning_rate": 0.0001909116206865639, "loss": 1.5086, "step": 199 }, { "epoch": 0.11379800853485064, "grad_norm": 0.5399172306060791, "learning_rate": 0.00019081669650205564, "loss": 1.5051, "step": 200 }, { "epoch": 0.11436699857752489, "grad_norm": 0.49494683742523193, "learning_rate": 0.0001907213030190405, "loss": 1.5123, "step": 201 }, { "epoch": 0.11493598862019914, "grad_norm": 0.5344505906105042, "learning_rate": 0.00019062544073046768, "loss": 1.5364, "step": 202 }, { "epoch": 0.1155049786628734, "grad_norm": 0.5201467871665955, "learning_rate": 0.00019052911013170892, "loss": 1.5027, "step": 203 }, { "epoch": 0.11607396870554765, "grad_norm": 0.5991513729095459, "learning_rate": 0.00019043231172055603, "loss": 1.6402, "step": 204 }, { "epoch": 0.1166429587482219, "grad_norm": 0.5526711940765381, "learning_rate": 0.00019033504599721827, "loss": 1.6166, "step": 205 }, { "epoch": 0.11721194879089616, "grad_norm": 0.493965208530426, "learning_rate": 0.00019023731346431972, "loss": 1.3099, "step": 206 }, { "epoch": 0.11778093883357041, "grad_norm": 0.5043678879737854, "learning_rate": 0.00019013911462689668, "loss": 1.3328, "step": 207 }, { "epoch": 0.11834992887624467, "grad_norm": 0.518515944480896, "learning_rate": 0.00019004044999239517, "loss": 1.453, "step": 208 }, { "epoch": 0.11891891891891893, "grad_norm": 0.547725260257721, "learning_rate": 0.00018994132007066816, "loss": 1.552, "step": 209 }, { "epoch": 0.11948790896159317, "grad_norm": 0.5498734712600708, "learning_rate": 0.0001898417253739731, "loss": 1.6076, "step": 210 }, { "epoch": 0.12005689900426743, "grad_norm": 0.5087684392929077, "learning_rate": 0.00018974166641696908, "loss": 1.3459, "step": 211 }, { "epoch": 0.12062588904694167, "grad_norm": 0.49864476919174194, "learning_rate": 0.00018964114371671428, "loss": 1.502, "step": 212 }, { "epoch": 0.12119487908961593, "grad_norm": 0.49818646907806396, "learning_rate": 0.0001895401577926634, "loss": 1.5047, "step": 213 }, { "epoch": 0.12176386913229019, "grad_norm": 0.5151641964912415, "learning_rate": 0.00018943870916666476, "loss": 1.5276, "step": 214 }, { "epoch": 0.12233285917496443, "grad_norm": 0.5294698476791382, "learning_rate": 0.00018933679836295777, "loss": 1.4735, "step": 215 }, { "epoch": 0.12290184921763869, "grad_norm": 0.5169737339019775, "learning_rate": 0.0001892344259081701, "loss": 1.6458, "step": 216 }, { "epoch": 0.12347083926031295, "grad_norm": 0.5262957811355591, "learning_rate": 0.000189131592331315, "loss": 1.6239, "step": 217 }, { "epoch": 0.1240398293029872, "grad_norm": 0.5043689012527466, "learning_rate": 0.00018902829816378876, "loss": 1.5785, "step": 218 }, { "epoch": 0.12460881934566145, "grad_norm": 0.5032008290290833, "learning_rate": 0.00018892454393936754, "loss": 1.4075, "step": 219 }, { "epoch": 0.1251778093883357, "grad_norm": 0.5261518359184265, "learning_rate": 0.00018882033019420504, "loss": 1.4251, "step": 220 }, { "epoch": 0.12574679943100997, "grad_norm": 0.5519723296165466, "learning_rate": 0.00018871565746682949, "loss": 1.6654, "step": 221 }, { "epoch": 0.12631578947368421, "grad_norm": 0.5465745329856873, "learning_rate": 0.0001886105262981409, "loss": 1.5489, "step": 222 }, { "epoch": 0.12688477951635846, "grad_norm": 0.6040769219398499, "learning_rate": 0.00018850493723140835, "loss": 1.6205, "step": 223 }, { "epoch": 0.1274537695590327, "grad_norm": 0.5207870006561279, "learning_rate": 0.0001883988908122671, "loss": 1.5843, "step": 224 }, { "epoch": 0.12802275960170698, "grad_norm": 0.5130170583724976, "learning_rate": 0.00018829238758871574, "loss": 1.5384, "step": 225 }, { "epoch": 0.12859174964438122, "grad_norm": 0.5100380182266235, "learning_rate": 0.00018818542811111354, "loss": 1.5026, "step": 226 }, { "epoch": 0.12916073968705546, "grad_norm": 0.5047493577003479, "learning_rate": 0.00018807801293217735, "loss": 1.4774, "step": 227 }, { "epoch": 0.12972972972972974, "grad_norm": 0.5392350554466248, "learning_rate": 0.0001879701426069789, "loss": 1.2986, "step": 228 }, { "epoch": 0.13029871977240398, "grad_norm": 0.4927089810371399, "learning_rate": 0.00018786181769294203, "loss": 1.3298, "step": 229 }, { "epoch": 0.13086770981507823, "grad_norm": 0.5079994797706604, "learning_rate": 0.0001877530387498395, "loss": 1.4027, "step": 230 }, { "epoch": 0.1314366998577525, "grad_norm": 0.5074231624603271, "learning_rate": 0.00018764380633979035, "loss": 1.6176, "step": 231 }, { "epoch": 0.13200568990042674, "grad_norm": 0.5501790642738342, "learning_rate": 0.00018753412102725698, "loss": 1.3795, "step": 232 }, { "epoch": 0.132574679943101, "grad_norm": 0.5117084383964539, "learning_rate": 0.00018742398337904213, "loss": 1.4731, "step": 233 }, { "epoch": 0.13314366998577526, "grad_norm": 0.5027900338172913, "learning_rate": 0.00018731339396428607, "loss": 1.5399, "step": 234 }, { "epoch": 0.1337126600284495, "grad_norm": 0.5187605619430542, "learning_rate": 0.00018720235335446342, "loss": 1.5111, "step": 235 }, { "epoch": 0.13428165007112375, "grad_norm": 0.5272188782691956, "learning_rate": 0.00018709086212338058, "loss": 1.5717, "step": 236 }, { "epoch": 0.13485064011379802, "grad_norm": 0.5339289903640747, "learning_rate": 0.00018697892084717238, "loss": 1.4529, "step": 237 }, { "epoch": 0.13541963015647226, "grad_norm": 0.5382213592529297, "learning_rate": 0.00018686653010429937, "loss": 1.5727, "step": 238 }, { "epoch": 0.1359886201991465, "grad_norm": 0.5148522257804871, "learning_rate": 0.00018675369047554475, "loss": 1.5683, "step": 239 }, { "epoch": 0.13655761024182078, "grad_norm": 0.5300989747047424, "learning_rate": 0.00018664040254401121, "loss": 1.6485, "step": 240 }, { "epoch": 0.13712660028449503, "grad_norm": 0.5400955080986023, "learning_rate": 0.00018652666689511824, "loss": 1.5095, "step": 241 }, { "epoch": 0.13769559032716927, "grad_norm": 0.49695253372192383, "learning_rate": 0.0001864124841165988, "loss": 1.3692, "step": 242 }, { "epoch": 0.13826458036984351, "grad_norm": 0.5431788563728333, "learning_rate": 0.00018629785479849656, "loss": 1.5774, "step": 243 }, { "epoch": 0.1388335704125178, "grad_norm": 0.5125901103019714, "learning_rate": 0.00018618277953316245, "loss": 1.3545, "step": 244 }, { "epoch": 0.13940256045519203, "grad_norm": 0.5172457695007324, "learning_rate": 0.0001860672589152521, "loss": 1.5196, "step": 245 }, { "epoch": 0.13997155049786628, "grad_norm": 0.5287220478057861, "learning_rate": 0.00018595129354172235, "loss": 1.7279, "step": 246 }, { "epoch": 0.14054054054054055, "grad_norm": 0.5728311538696289, "learning_rate": 0.00018583488401182843, "loss": 1.5514, "step": 247 }, { "epoch": 0.1411095305832148, "grad_norm": 0.5267804861068726, "learning_rate": 0.0001857180309271207, "loss": 1.5115, "step": 248 }, { "epoch": 0.14167852062588904, "grad_norm": 0.5459727644920349, "learning_rate": 0.00018560073489144166, "loss": 1.5057, "step": 249 }, { "epoch": 0.1422475106685633, "grad_norm": 0.5065287947654724, "learning_rate": 0.00018548299651092269, "loss": 1.4906, "step": 250 }, { "epoch": 0.14281650071123755, "grad_norm": 0.5647059082984924, "learning_rate": 0.00018536481639398107, "loss": 1.5447, "step": 251 }, { "epoch": 0.1433854907539118, "grad_norm": 0.5164194703102112, "learning_rate": 0.00018524619515131679, "loss": 1.6922, "step": 252 }, { "epoch": 0.14395448079658607, "grad_norm": 0.5288499593734741, "learning_rate": 0.0001851271333959093, "loss": 1.5596, "step": 253 }, { "epoch": 0.14452347083926032, "grad_norm": 0.509348452091217, "learning_rate": 0.00018500763174301448, "loss": 1.6263, "step": 254 }, { "epoch": 0.14509246088193456, "grad_norm": 0.5377824902534485, "learning_rate": 0.00018488769081016133, "loss": 1.4711, "step": 255 }, { "epoch": 0.14566145092460883, "grad_norm": 0.5068728923797607, "learning_rate": 0.00018476731121714894, "loss": 1.6706, "step": 256 }, { "epoch": 0.14623044096728308, "grad_norm": 0.5097038745880127, "learning_rate": 0.0001846464935860431, "loss": 1.5841, "step": 257 }, { "epoch": 0.14679943100995732, "grad_norm": 0.5391016006469727, "learning_rate": 0.0001845252385411732, "loss": 1.6935, "step": 258 }, { "epoch": 0.14736842105263157, "grad_norm": 0.5154038667678833, "learning_rate": 0.00018440354670912906, "loss": 1.3827, "step": 259 }, { "epoch": 0.14793741109530584, "grad_norm": 0.5789750814437866, "learning_rate": 0.00018428141871875743, "loss": 1.545, "step": 260 }, { "epoch": 0.14850640113798008, "grad_norm": 0.5456128716468811, "learning_rate": 0.00018415885520115915, "loss": 1.5359, "step": 261 }, { "epoch": 0.14907539118065433, "grad_norm": 0.6158856749534607, "learning_rate": 0.00018403585678968551, "loss": 1.7601, "step": 262 }, { "epoch": 0.1496443812233286, "grad_norm": 0.4721933603286743, "learning_rate": 0.00018391242411993516, "loss": 1.3328, "step": 263 }, { "epoch": 0.15021337126600284, "grad_norm": 0.5242535471916199, "learning_rate": 0.00018378855782975084, "loss": 1.3359, "step": 264 }, { "epoch": 0.1507823613086771, "grad_norm": 0.5116239190101624, "learning_rate": 0.000183664258559216, "loss": 1.218, "step": 265 }, { "epoch": 0.15135135135135136, "grad_norm": 0.5715349316596985, "learning_rate": 0.0001835395269506515, "loss": 1.7737, "step": 266 }, { "epoch": 0.1519203413940256, "grad_norm": 0.5294284224510193, "learning_rate": 0.0001834143636486124, "loss": 1.7273, "step": 267 }, { "epoch": 0.15248933143669985, "grad_norm": 0.5225195288658142, "learning_rate": 0.0001832887692998845, "loss": 1.5397, "step": 268 }, { "epoch": 0.15305832147937412, "grad_norm": 0.5032251477241516, "learning_rate": 0.00018316274455348105, "loss": 1.4483, "step": 269 }, { "epoch": 0.15362731152204837, "grad_norm": 0.5733814835548401, "learning_rate": 0.00018303629006063943, "loss": 1.5798, "step": 270 }, { "epoch": 0.1541963015647226, "grad_norm": 0.5273986458778381, "learning_rate": 0.0001829094064748177, "loss": 1.6515, "step": 271 }, { "epoch": 0.15476529160739688, "grad_norm": 0.563911497592926, "learning_rate": 0.00018278209445169135, "loss": 1.6408, "step": 272 }, { "epoch": 0.15533428165007113, "grad_norm": 0.5052376985549927, "learning_rate": 0.00018265435464914973, "loss": 1.3572, "step": 273 }, { "epoch": 0.15590327169274537, "grad_norm": 0.5052018761634827, "learning_rate": 0.0001825261877272928, "loss": 1.5019, "step": 274 }, { "epoch": 0.15647226173541964, "grad_norm": 0.4795508086681366, "learning_rate": 0.00018239759434842773, "loss": 1.0659, "step": 275 }, { "epoch": 0.1570412517780939, "grad_norm": 0.5224232077598572, "learning_rate": 0.00018226857517706537, "loss": 1.6048, "step": 276 }, { "epoch": 0.15761024182076813, "grad_norm": 0.5337119698524475, "learning_rate": 0.00018213913087991685, "loss": 1.4884, "step": 277 }, { "epoch": 0.15817923186344238, "grad_norm": 0.48973479866981506, "learning_rate": 0.0001820092621258902, "loss": 1.3599, "step": 278 }, { "epoch": 0.15874822190611665, "grad_norm": 0.4995887577533722, "learning_rate": 0.0001818789695860868, "loss": 1.5088, "step": 279 }, { "epoch": 0.1593172119487909, "grad_norm": 0.513390064239502, "learning_rate": 0.00018174825393379798, "loss": 1.5376, "step": 280 }, { "epoch": 0.15988620199146514, "grad_norm": 0.5285114645957947, "learning_rate": 0.00018161711584450152, "loss": 1.706, "step": 281 }, { "epoch": 0.1604551920341394, "grad_norm": 0.5384095907211304, "learning_rate": 0.00018148555599585816, "loss": 1.474, "step": 282 }, { "epoch": 0.16102418207681365, "grad_norm": 0.5326551795005798, "learning_rate": 0.0001813535750677081, "loss": 1.4764, "step": 283 }, { "epoch": 0.1615931721194879, "grad_norm": 0.538357675075531, "learning_rate": 0.0001812211737420675, "loss": 1.7382, "step": 284 }, { "epoch": 0.16216216216216217, "grad_norm": 0.5192847847938538, "learning_rate": 0.00018108835270312488, "loss": 1.5809, "step": 285 }, { "epoch": 0.16273115220483642, "grad_norm": 0.5059441328048706, "learning_rate": 0.00018095511263723768, "loss": 1.3315, "step": 286 }, { "epoch": 0.16330014224751066, "grad_norm": 0.542091429233551, "learning_rate": 0.00018082145423292868, "loss": 1.394, "step": 287 }, { "epoch": 0.16386913229018493, "grad_norm": 0.5587398409843445, "learning_rate": 0.00018068737818088248, "loss": 1.5478, "step": 288 }, { "epoch": 0.16443812233285918, "grad_norm": 0.5091587901115417, "learning_rate": 0.00018055288517394174, "loss": 1.4298, "step": 289 }, { "epoch": 0.16500711237553342, "grad_norm": 0.5347201228141785, "learning_rate": 0.00018041797590710398, "loss": 1.4504, "step": 290 }, { "epoch": 0.1655761024182077, "grad_norm": 0.5370376110076904, "learning_rate": 0.00018028265107751756, "loss": 1.6061, "step": 291 }, { "epoch": 0.16614509246088194, "grad_norm": 0.5322532057762146, "learning_rate": 0.00018014691138447834, "loss": 1.5102, "step": 292 }, { "epoch": 0.16671408250355618, "grad_norm": 0.4970771074295044, "learning_rate": 0.00018001075752942605, "loss": 1.3017, "step": 293 }, { "epoch": 0.16728307254623045, "grad_norm": 0.5143032670021057, "learning_rate": 0.00017987419021594053, "loss": 1.5115, "step": 294 }, { "epoch": 0.1678520625889047, "grad_norm": 0.4978564977645874, "learning_rate": 0.00017973721014973823, "loss": 1.33, "step": 295 }, { "epoch": 0.16842105263157894, "grad_norm": 0.5085217356681824, "learning_rate": 0.00017959981803866856, "loss": 1.3251, "step": 296 }, { "epoch": 0.1689900426742532, "grad_norm": 0.522738516330719, "learning_rate": 0.0001794620145927101, "loss": 1.3305, "step": 297 }, { "epoch": 0.16955903271692746, "grad_norm": 0.506791353225708, "learning_rate": 0.00017932380052396702, "loss": 1.5626, "step": 298 }, { "epoch": 0.1701280227596017, "grad_norm": 0.541067898273468, "learning_rate": 0.0001791851765466655, "loss": 1.6446, "step": 299 }, { "epoch": 0.17069701280227595, "grad_norm": 0.5105940103530884, "learning_rate": 0.0001790461433771498, "loss": 1.5842, "step": 300 }, { "epoch": 0.17126600284495022, "grad_norm": 0.49997130036354065, "learning_rate": 0.00017890670173387885, "loss": 1.5844, "step": 301 }, { "epoch": 0.17183499288762447, "grad_norm": 0.5258059501647949, "learning_rate": 0.00017876685233742226, "loss": 1.5576, "step": 302 }, { "epoch": 0.1724039829302987, "grad_norm": 0.5664198398590088, "learning_rate": 0.00017862659591045673, "loss": 1.4313, "step": 303 }, { "epoch": 0.17297297297297298, "grad_norm": 0.5197086930274963, "learning_rate": 0.00017848593317776234, "loss": 1.4374, "step": 304 }, { "epoch": 0.17354196301564723, "grad_norm": 0.5377213954925537, "learning_rate": 0.0001783448648662188, "loss": 1.3973, "step": 305 }, { "epoch": 0.17411095305832147, "grad_norm": 0.4912850260734558, "learning_rate": 0.00017820339170480156, "loss": 1.3055, "step": 306 }, { "epoch": 0.17467994310099574, "grad_norm": 0.5148215293884277, "learning_rate": 0.00017806151442457827, "loss": 1.5493, "step": 307 }, { "epoch": 0.17524893314367, "grad_norm": 0.5305980443954468, "learning_rate": 0.0001779192337587048, "loss": 1.6176, "step": 308 }, { "epoch": 0.17581792318634423, "grad_norm": 0.5322251319885254, "learning_rate": 0.0001777765504424215, "loss": 1.6621, "step": 309 }, { "epoch": 0.1763869132290185, "grad_norm": 0.5405860543251038, "learning_rate": 0.00017763346521304955, "loss": 1.5951, "step": 310 }, { "epoch": 0.17695590327169275, "grad_norm": 0.5762712359428406, "learning_rate": 0.00017748997880998691, "loss": 1.4609, "step": 311 }, { "epoch": 0.177524893314367, "grad_norm": 0.5313809514045715, "learning_rate": 0.0001773460919747047, "loss": 1.4488, "step": 312 }, { "epoch": 0.17809388335704124, "grad_norm": 0.5385677814483643, "learning_rate": 0.00017720180545074322, "loss": 1.5543, "step": 313 }, { "epoch": 0.1786628733997155, "grad_norm": 0.5349786877632141, "learning_rate": 0.00017705711998370824, "loss": 1.5848, "step": 314 }, { "epoch": 0.17923186344238975, "grad_norm": 0.5395460724830627, "learning_rate": 0.00017691203632126706, "loss": 1.5344, "step": 315 }, { "epoch": 0.179800853485064, "grad_norm": 0.5073065757751465, "learning_rate": 0.0001767665552131446, "loss": 1.4227, "step": 316 }, { "epoch": 0.18036984352773827, "grad_norm": 0.5242070555686951, "learning_rate": 0.00017662067741111974, "loss": 1.5054, "step": 317 }, { "epoch": 0.18093883357041252, "grad_norm": 0.5271447896957397, "learning_rate": 0.00017647440366902117, "loss": 1.5675, "step": 318 }, { "epoch": 0.18150782361308676, "grad_norm": 0.5302979946136475, "learning_rate": 0.00017632773474272363, "loss": 1.4631, "step": 319 }, { "epoch": 0.18207681365576103, "grad_norm": 0.5438220500946045, "learning_rate": 0.00017618067139014404, "loss": 1.4737, "step": 320 }, { "epoch": 0.18264580369843528, "grad_norm": 0.5002385377883911, "learning_rate": 0.0001760332143712375, "loss": 1.3976, "step": 321 }, { "epoch": 0.18321479374110952, "grad_norm": 0.5478991866111755, "learning_rate": 0.00017588536444799338, "loss": 1.527, "step": 322 }, { "epoch": 0.1837837837837838, "grad_norm": 0.5406285524368286, "learning_rate": 0.0001757371223844314, "loss": 1.4453, "step": 323 }, { "epoch": 0.18435277382645804, "grad_norm": 0.5226593613624573, "learning_rate": 0.00017558848894659771, "loss": 1.5309, "step": 324 }, { "epoch": 0.18492176386913228, "grad_norm": 0.5488921999931335, "learning_rate": 0.0001754394649025609, "loss": 1.6993, "step": 325 }, { "epoch": 0.18549075391180656, "grad_norm": 0.5268238186836243, "learning_rate": 0.000175290051022408, "loss": 1.4578, "step": 326 }, { "epoch": 0.1860597439544808, "grad_norm": 0.5236526727676392, "learning_rate": 0.00017514024807824055, "loss": 1.5276, "step": 327 }, { "epoch": 0.18662873399715504, "grad_norm": 0.5280612707138062, "learning_rate": 0.00017499005684417057, "loss": 1.5191, "step": 328 }, { "epoch": 0.18719772403982932, "grad_norm": 0.5311048030853271, "learning_rate": 0.0001748394780963166, "loss": 1.6317, "step": 329 }, { "epoch": 0.18776671408250356, "grad_norm": 0.5343871712684631, "learning_rate": 0.0001746885126127997, "loss": 1.6759, "step": 330 }, { "epoch": 0.1883357041251778, "grad_norm": 0.5824495553970337, "learning_rate": 0.00017453716117373937, "loss": 1.5064, "step": 331 }, { "epoch": 0.18890469416785205, "grad_norm": 0.5165912508964539, "learning_rate": 0.0001743854245612495, "loss": 1.413, "step": 332 }, { "epoch": 0.18947368421052632, "grad_norm": 0.5721679329872131, "learning_rate": 0.0001742333035594345, "loss": 1.3518, "step": 333 }, { "epoch": 0.19004267425320057, "grad_norm": 0.5547354817390442, "learning_rate": 0.00017408079895438498, "loss": 1.7325, "step": 334 }, { "epoch": 0.1906116642958748, "grad_norm": 0.5567200779914856, "learning_rate": 0.00017392791153417398, "loss": 1.6179, "step": 335 }, { "epoch": 0.19118065433854908, "grad_norm": 0.5186401009559631, "learning_rate": 0.00017377464208885265, "loss": 1.3499, "step": 336 }, { "epoch": 0.19174964438122333, "grad_norm": 0.5111268758773804, "learning_rate": 0.00017362099141044626, "loss": 1.2942, "step": 337 }, { "epoch": 0.19231863442389757, "grad_norm": 0.5359705090522766, "learning_rate": 0.0001734669602929502, "loss": 1.552, "step": 338 }, { "epoch": 0.19288762446657184, "grad_norm": 0.5835704803466797, "learning_rate": 0.0001733125495323257, "loss": 1.3161, "step": 339 }, { "epoch": 0.1934566145092461, "grad_norm": 0.5223122835159302, "learning_rate": 0.00017315775992649584, "loss": 1.5189, "step": 340 }, { "epoch": 0.19402560455192033, "grad_norm": 0.5331559777259827, "learning_rate": 0.0001730025922753415, "loss": 1.7263, "step": 341 }, { "epoch": 0.1945945945945946, "grad_norm": 0.54593425989151, "learning_rate": 0.00017284704738069698, "loss": 1.5158, "step": 342 }, { "epoch": 0.19516358463726885, "grad_norm": 0.5385016202926636, "learning_rate": 0.000172691126046346, "loss": 1.5762, "step": 343 }, { "epoch": 0.1957325746799431, "grad_norm": 0.4981791079044342, "learning_rate": 0.00017253482907801773, "loss": 1.3606, "step": 344 }, { "epoch": 0.19630156472261737, "grad_norm": 0.5046445727348328, "learning_rate": 0.00017237815728338217, "loss": 1.382, "step": 345 }, { "epoch": 0.1968705547652916, "grad_norm": 0.5692354440689087, "learning_rate": 0.00017222111147204645, "loss": 1.6214, "step": 346 }, { "epoch": 0.19743954480796586, "grad_norm": 0.5191353559494019, "learning_rate": 0.00017206369245555036, "loss": 1.459, "step": 347 }, { "epoch": 0.1980085348506401, "grad_norm": 0.5159747004508972, "learning_rate": 0.0001719059010473623, "loss": 1.6057, "step": 348 }, { "epoch": 0.1980085348506401, "eval_loss": 1.506325602531433, "eval_runtime": 16.4362, "eval_samples_per_second": 45.023, "eval_steps_per_second": 22.511, "step": 348 }, { "epoch": 0.19857752489331437, "grad_norm": 0.5306143164634705, "learning_rate": 0.00017174773806287496, "loss": 1.5776, "step": 349 }, { "epoch": 0.19914651493598862, "grad_norm": 0.5569584369659424, "learning_rate": 0.00017158920431940117, "loss": 1.5926, "step": 350 }, { "epoch": 0.19971550497866286, "grad_norm": 0.5538038611412048, "learning_rate": 0.0001714303006361697, "loss": 1.6146, "step": 351 }, { "epoch": 0.20028449502133713, "grad_norm": 0.5369197130203247, "learning_rate": 0.00017127102783432097, "loss": 1.514, "step": 352 }, { "epoch": 0.20085348506401138, "grad_norm": 0.6111621856689453, "learning_rate": 0.00017111138673690283, "loss": 1.3508, "step": 353 }, { "epoch": 0.20142247510668562, "grad_norm": 0.5350061655044556, "learning_rate": 0.0001709513781688664, "loss": 1.5506, "step": 354 }, { "epoch": 0.2019914651493599, "grad_norm": 0.5226223468780518, "learning_rate": 0.00017079100295706154, "loss": 1.55, "step": 355 }, { "epoch": 0.20256045519203414, "grad_norm": 0.5834634304046631, "learning_rate": 0.0001706302619302329, "loss": 1.6025, "step": 356 }, { "epoch": 0.20312944523470838, "grad_norm": 0.564756453037262, "learning_rate": 0.0001704691559190155, "loss": 1.5174, "step": 357 }, { "epoch": 0.20369843527738266, "grad_norm": 0.5217262506484985, "learning_rate": 0.00017030768575593025, "loss": 1.4321, "step": 358 }, { "epoch": 0.2042674253200569, "grad_norm": 0.5270060896873474, "learning_rate": 0.0001701458522753801, "loss": 1.6006, "step": 359 }, { "epoch": 0.20483641536273114, "grad_norm": 0.5722881555557251, "learning_rate": 0.00016998365631364527, "loss": 1.7025, "step": 360 }, { "epoch": 0.20540540540540542, "grad_norm": 0.5267907977104187, "learning_rate": 0.00016982109870887908, "loss": 1.5108, "step": 361 }, { "epoch": 0.20597439544807966, "grad_norm": 0.5428017973899841, "learning_rate": 0.00016965818030110382, "loss": 1.6343, "step": 362 }, { "epoch": 0.2065433854907539, "grad_norm": 0.5151480436325073, "learning_rate": 0.0001694949019322061, "loss": 1.5242, "step": 363 }, { "epoch": 0.20711237553342818, "grad_norm": 0.5217251181602478, "learning_rate": 0.00016933126444593273, "loss": 1.54, "step": 364 }, { "epoch": 0.20768136557610242, "grad_norm": 0.5215661525726318, "learning_rate": 0.00016916726868788622, "loss": 1.5131, "step": 365 }, { "epoch": 0.20825035561877667, "grad_norm": 0.5087475776672363, "learning_rate": 0.00016900291550552048, "loss": 1.6782, "step": 366 }, { "epoch": 0.2088193456614509, "grad_norm": 0.5366347432136536, "learning_rate": 0.0001688382057481364, "loss": 1.5821, "step": 367 }, { "epoch": 0.20938833570412518, "grad_norm": 0.5469174385070801, "learning_rate": 0.00016867314026687753, "loss": 1.8795, "step": 368 }, { "epoch": 0.20995732574679943, "grad_norm": 0.5702829957008362, "learning_rate": 0.00016850771991472563, "loss": 1.4382, "step": 369 }, { "epoch": 0.21052631578947367, "grad_norm": 0.5792803764343262, "learning_rate": 0.0001683419455464962, "loss": 1.6934, "step": 370 }, { "epoch": 0.21109530583214794, "grad_norm": 0.5423445701599121, "learning_rate": 0.0001681758180188342, "loss": 1.5408, "step": 371 }, { "epoch": 0.2116642958748222, "grad_norm": 0.5211445093154907, "learning_rate": 0.00016800933819020956, "loss": 1.5354, "step": 372 }, { "epoch": 0.21223328591749643, "grad_norm": 0.5631567239761353, "learning_rate": 0.0001678425069209127, "loss": 1.6356, "step": 373 }, { "epoch": 0.2128022759601707, "grad_norm": 0.5736171007156372, "learning_rate": 0.0001676753250730501, "loss": 1.6202, "step": 374 }, { "epoch": 0.21337126600284495, "grad_norm": 0.5194095373153687, "learning_rate": 0.00016750779351053994, "loss": 1.4419, "step": 375 }, { "epoch": 0.2139402560455192, "grad_norm": 0.5220928192138672, "learning_rate": 0.0001673399130991075, "loss": 1.4182, "step": 376 }, { "epoch": 0.21450924608819347, "grad_norm": 0.5223848819732666, "learning_rate": 0.00016717168470628077, "loss": 1.5831, "step": 377 }, { "epoch": 0.2150782361308677, "grad_norm": 0.5400263071060181, "learning_rate": 0.00016700310920138596, "loss": 1.579, "step": 378 }, { "epoch": 0.21564722617354196, "grad_norm": 0.5276429653167725, "learning_rate": 0.00016683418745554299, "loss": 1.4674, "step": 379 }, { "epoch": 0.21621621621621623, "grad_norm": 0.5498270392417908, "learning_rate": 0.000166664920341661, "loss": 1.8171, "step": 380 }, { "epoch": 0.21678520625889047, "grad_norm": 0.5207138657569885, "learning_rate": 0.00016649530873443375, "loss": 1.3337, "step": 381 }, { "epoch": 0.21735419630156472, "grad_norm": 0.5555972456932068, "learning_rate": 0.00016632535351033533, "loss": 1.5634, "step": 382 }, { "epoch": 0.217923186344239, "grad_norm": 0.5569733381271362, "learning_rate": 0.00016615505554761533, "loss": 1.6649, "step": 383 }, { "epoch": 0.21849217638691323, "grad_norm": 0.5526515245437622, "learning_rate": 0.00016598441572629458, "loss": 1.2708, "step": 384 }, { "epoch": 0.21906116642958748, "grad_norm": 0.5405237674713135, "learning_rate": 0.0001658134349281604, "loss": 1.5085, "step": 385 }, { "epoch": 0.21963015647226172, "grad_norm": 0.5164327621459961, "learning_rate": 0.00016564211403676213, "loss": 1.4096, "step": 386 }, { "epoch": 0.220199146514936, "grad_norm": 0.535915195941925, "learning_rate": 0.0001654704539374066, "loss": 1.5407, "step": 387 }, { "epoch": 0.22076813655761024, "grad_norm": 0.5589139461517334, "learning_rate": 0.0001652984555171534, "loss": 1.5837, "step": 388 }, { "epoch": 0.22133712660028448, "grad_norm": 0.5141209959983826, "learning_rate": 0.00016512611966481056, "loss": 1.377, "step": 389 }, { "epoch": 0.22190611664295876, "grad_norm": 0.514789879322052, "learning_rate": 0.00016495344727092973, "loss": 1.5191, "step": 390 }, { "epoch": 0.222475106685633, "grad_norm": 0.5353395342826843, "learning_rate": 0.00016478043922780157, "loss": 1.5026, "step": 391 }, { "epoch": 0.22304409672830725, "grad_norm": 0.5318089127540588, "learning_rate": 0.00016460709642945133, "loss": 1.5277, "step": 392 }, { "epoch": 0.22361308677098152, "grad_norm": 0.5722904205322266, "learning_rate": 0.00016443341977163408, "loss": 1.3433, "step": 393 }, { "epoch": 0.22418207681365576, "grad_norm": 0.542008101940155, "learning_rate": 0.0001642594101518301, "loss": 1.5241, "step": 394 }, { "epoch": 0.22475106685633, "grad_norm": 0.5351589918136597, "learning_rate": 0.00016408506846924035, "loss": 1.6335, "step": 395 }, { "epoch": 0.22532005689900428, "grad_norm": 0.5150931477546692, "learning_rate": 0.00016391039562478157, "loss": 1.5412, "step": 396 }, { "epoch": 0.22588904694167852, "grad_norm": 0.5498356819152832, "learning_rate": 0.00016373539252108202, "loss": 1.5062, "step": 397 }, { "epoch": 0.22645803698435277, "grad_norm": 0.5373052358627319, "learning_rate": 0.0001635600600624763, "loss": 1.6658, "step": 398 }, { "epoch": 0.22702702702702704, "grad_norm": 0.5198200941085815, "learning_rate": 0.00016338439915500127, "loss": 1.3554, "step": 399 }, { "epoch": 0.22759601706970128, "grad_norm": 0.5517953038215637, "learning_rate": 0.00016320841070639083, "loss": 1.5403, "step": 400 }, { "epoch": 0.22816500711237553, "grad_norm": 0.5407613515853882, "learning_rate": 0.00016303209562607154, "loss": 1.5033, "step": 401 }, { "epoch": 0.22873399715504977, "grad_norm": 0.5271732211112976, "learning_rate": 0.00016285545482515792, "loss": 1.4554, "step": 402 }, { "epoch": 0.22930298719772405, "grad_norm": 0.5387139916419983, "learning_rate": 0.0001626784892164475, "loss": 1.7347, "step": 403 }, { "epoch": 0.2298719772403983, "grad_norm": 0.5222678780555725, "learning_rate": 0.00016250119971441637, "loss": 1.4489, "step": 404 }, { "epoch": 0.23044096728307253, "grad_norm": 0.5498174428939819, "learning_rate": 0.00016232358723521436, "loss": 1.6047, "step": 405 }, { "epoch": 0.2310099573257468, "grad_norm": 0.5119244456291199, "learning_rate": 0.0001621456526966603, "loss": 1.5818, "step": 406 }, { "epoch": 0.23157894736842105, "grad_norm": 0.5584565997123718, "learning_rate": 0.00016196739701823716, "loss": 1.6863, "step": 407 }, { "epoch": 0.2321479374110953, "grad_norm": 0.5125292539596558, "learning_rate": 0.00016178882112108752, "loss": 1.4137, "step": 408 }, { "epoch": 0.23271692745376957, "grad_norm": 0.518551230430603, "learning_rate": 0.00016160992592800872, "loss": 1.304, "step": 409 }, { "epoch": 0.2332859174964438, "grad_norm": 0.5396437048912048, "learning_rate": 0.00016143071236344797, "loss": 1.6118, "step": 410 }, { "epoch": 0.23385490753911806, "grad_norm": 0.6036053895950317, "learning_rate": 0.0001612511813534978, "loss": 1.5618, "step": 411 }, { "epoch": 0.23442389758179233, "grad_norm": 0.5274645686149597, "learning_rate": 0.00016107133382589105, "loss": 1.5238, "step": 412 }, { "epoch": 0.23499288762446657, "grad_norm": 0.5649259090423584, "learning_rate": 0.00016089117070999616, "loss": 1.4841, "step": 413 }, { "epoch": 0.23556187766714082, "grad_norm": 0.5350419282913208, "learning_rate": 0.0001607106929368125, "loss": 1.4252, "step": 414 }, { "epoch": 0.2361308677098151, "grad_norm": 0.5421844124794006, "learning_rate": 0.00016052990143896535, "loss": 1.3899, "step": 415 }, { "epoch": 0.23669985775248933, "grad_norm": 0.5462636947631836, "learning_rate": 0.0001603487971507012, "loss": 1.6417, "step": 416 }, { "epoch": 0.23726884779516358, "grad_norm": 0.564430832862854, "learning_rate": 0.00016016738100788297, "loss": 1.6418, "step": 417 }, { "epoch": 0.23783783783783785, "grad_norm": 0.5399342179298401, "learning_rate": 0.00015998565394798492, "loss": 1.3624, "step": 418 }, { "epoch": 0.2384068278805121, "grad_norm": 0.5136001706123352, "learning_rate": 0.00015980361691008815, "loss": 1.3956, "step": 419 }, { "epoch": 0.23897581792318634, "grad_norm": 0.5325256586074829, "learning_rate": 0.00015962127083487548, "loss": 1.2396, "step": 420 }, { "epoch": 0.23954480796586058, "grad_norm": 0.5132279396057129, "learning_rate": 0.00015943861666462675, "loss": 1.4461, "step": 421 }, { "epoch": 0.24011379800853486, "grad_norm": 0.5597640872001648, "learning_rate": 0.0001592556553432139, "loss": 1.5031, "step": 422 }, { "epoch": 0.2406827880512091, "grad_norm": 0.5563086271286011, "learning_rate": 0.00015907238781609606, "loss": 1.4839, "step": 423 }, { "epoch": 0.24125177809388335, "grad_norm": 0.557904839515686, "learning_rate": 0.00015888881503031468, "loss": 1.6277, "step": 424 }, { "epoch": 0.24182076813655762, "grad_norm": 0.5795301198959351, "learning_rate": 0.00015870493793448864, "loss": 1.4073, "step": 425 }, { "epoch": 0.24238975817923186, "grad_norm": 0.5133345127105713, "learning_rate": 0.00015852075747880938, "loss": 1.3689, "step": 426 }, { "epoch": 0.2429587482219061, "grad_norm": 0.5455712676048279, "learning_rate": 0.00015833627461503595, "loss": 1.6118, "step": 427 }, { "epoch": 0.24352773826458038, "grad_norm": 0.5585681796073914, "learning_rate": 0.00015815149029649013, "loss": 1.5628, "step": 428 }, { "epoch": 0.24409672830725462, "grad_norm": 0.5475082397460938, "learning_rate": 0.0001579664054780514, "loss": 1.5907, "step": 429 }, { "epoch": 0.24466571834992887, "grad_norm": 0.530405580997467, "learning_rate": 0.0001577810211161522, "loss": 1.5324, "step": 430 }, { "epoch": 0.24523470839260314, "grad_norm": 0.5662998557090759, "learning_rate": 0.00015759533816877275, "loss": 1.2456, "step": 431 }, { "epoch": 0.24580369843527738, "grad_norm": 0.6249381303787231, "learning_rate": 0.0001574093575954363, "loss": 1.4694, "step": 432 }, { "epoch": 0.24637268847795163, "grad_norm": 0.5382659435272217, "learning_rate": 0.00015722308035720408, "loss": 1.6025, "step": 433 }, { "epoch": 0.2469416785206259, "grad_norm": 0.5415714383125305, "learning_rate": 0.00015703650741667036, "loss": 1.3643, "step": 434 }, { "epoch": 0.24751066856330015, "grad_norm": 0.540256917476654, "learning_rate": 0.0001568496397379574, "loss": 1.4577, "step": 435 }, { "epoch": 0.2480796586059744, "grad_norm": 0.5126465559005737, "learning_rate": 0.0001566624782867106, "loss": 1.5512, "step": 436 }, { "epoch": 0.24864864864864866, "grad_norm": 0.5520801544189453, "learning_rate": 0.0001564750240300934, "loss": 1.6545, "step": 437 }, { "epoch": 0.2492176386913229, "grad_norm": 0.5290027260780334, "learning_rate": 0.00015628727793678233, "loss": 1.5391, "step": 438 }, { "epoch": 0.24978662873399715, "grad_norm": 0.5835967659950256, "learning_rate": 0.00015609924097696203, "loss": 1.4657, "step": 439 }, { "epoch": 0.2503556187766714, "grad_norm": 0.5586689710617065, "learning_rate": 0.00015591091412232012, "loss": 1.5222, "step": 440 }, { "epoch": 0.25092460881934564, "grad_norm": 0.5292929410934448, "learning_rate": 0.00015572229834604235, "loss": 1.4726, "step": 441 }, { "epoch": 0.25149359886201994, "grad_norm": 0.5165523290634155, "learning_rate": 0.00015553339462280748, "loss": 1.4154, "step": 442 }, { "epoch": 0.2520625889046942, "grad_norm": 0.5475851893424988, "learning_rate": 0.00015534420392878211, "loss": 1.5885, "step": 443 }, { "epoch": 0.25263157894736843, "grad_norm": 0.5540974736213684, "learning_rate": 0.00015515472724161598, "loss": 1.4529, "step": 444 }, { "epoch": 0.2532005689900427, "grad_norm": 0.5251240730285645, "learning_rate": 0.00015496496554043653, "loss": 1.3794, "step": 445 }, { "epoch": 0.2537695590327169, "grad_norm": 0.5751416683197021, "learning_rate": 0.00015477491980584417, "loss": 1.5417, "step": 446 }, { "epoch": 0.25433854907539116, "grad_norm": 0.5411546230316162, "learning_rate": 0.00015458459101990693, "loss": 1.6787, "step": 447 }, { "epoch": 0.2549075391180654, "grad_norm": 0.5817191004753113, "learning_rate": 0.00015439398016615558, "loss": 1.5382, "step": 448 }, { "epoch": 0.2554765291607397, "grad_norm": 0.505901038646698, "learning_rate": 0.00015420308822957848, "loss": 1.3885, "step": 449 }, { "epoch": 0.25604551920341395, "grad_norm": 0.5091856718063354, "learning_rate": 0.00015401191619661658, "loss": 1.4067, "step": 450 }, { "epoch": 0.2566145092460882, "grad_norm": 0.5677408576011658, "learning_rate": 0.00015382046505515803, "loss": 1.5578, "step": 451 }, { "epoch": 0.25718349928876244, "grad_norm": 0.5270281434059143, "learning_rate": 0.00015362873579453348, "loss": 1.3921, "step": 452 }, { "epoch": 0.2577524893314367, "grad_norm": 0.5784454345703125, "learning_rate": 0.00015343672940551067, "loss": 1.5433, "step": 453 }, { "epoch": 0.25832147937411093, "grad_norm": 0.5490661859512329, "learning_rate": 0.00015324444688028947, "loss": 1.4543, "step": 454 }, { "epoch": 0.25889046941678523, "grad_norm": 0.5555963516235352, "learning_rate": 0.00015305188921249665, "loss": 1.3882, "step": 455 }, { "epoch": 0.2594594594594595, "grad_norm": 0.5918729305267334, "learning_rate": 0.0001528590573971808, "loss": 1.6544, "step": 456 }, { "epoch": 0.2600284495021337, "grad_norm": 0.5301398038864136, "learning_rate": 0.00015266595243080714, "loss": 1.6201, "step": 457 }, { "epoch": 0.26059743954480796, "grad_norm": 0.5327576994895935, "learning_rate": 0.0001524725753112525, "loss": 1.6861, "step": 458 }, { "epoch": 0.2611664295874822, "grad_norm": 0.5090361833572388, "learning_rate": 0.00015227892703780003, "loss": 1.2298, "step": 459 }, { "epoch": 0.26173541963015645, "grad_norm": 0.5667193531990051, "learning_rate": 0.00015208500861113401, "loss": 1.4061, "step": 460 }, { "epoch": 0.26230440967283075, "grad_norm": 0.5170226097106934, "learning_rate": 0.00015189082103333484, "loss": 1.3402, "step": 461 }, { "epoch": 0.262873399715505, "grad_norm": 0.5260865688323975, "learning_rate": 0.0001516963653078737, "loss": 1.4571, "step": 462 }, { "epoch": 0.26344238975817924, "grad_norm": 0.5484414100646973, "learning_rate": 0.00015150164243960752, "loss": 1.4822, "step": 463 }, { "epoch": 0.2640113798008535, "grad_norm": 0.5555655360221863, "learning_rate": 0.00015130665343477358, "loss": 1.4383, "step": 464 }, { "epoch": 0.26458036984352773, "grad_norm": 0.5628737211227417, "learning_rate": 0.0001511113993009845, "loss": 1.6092, "step": 465 }, { "epoch": 0.265149359886202, "grad_norm": 0.5401899814605713, "learning_rate": 0.00015091588104722297, "loss": 1.4347, "step": 466 }, { "epoch": 0.2657183499288762, "grad_norm": 0.5575911998748779, "learning_rate": 0.00015072009968383656, "loss": 1.6627, "step": 467 }, { "epoch": 0.2662873399715505, "grad_norm": 0.539851725101471, "learning_rate": 0.00015052405622253235, "loss": 1.5648, "step": 468 }, { "epoch": 0.26685633001422476, "grad_norm": 0.5497231483459473, "learning_rate": 0.00015032775167637193, "loss": 1.5671, "step": 469 }, { "epoch": 0.267425320056899, "grad_norm": 0.5294174551963806, "learning_rate": 0.00015013118705976602, "loss": 1.4519, "step": 470 }, { "epoch": 0.26799431009957325, "grad_norm": 0.5508366227149963, "learning_rate": 0.00014993436338846925, "loss": 1.2089, "step": 471 }, { "epoch": 0.2685633001422475, "grad_norm": 0.530941903591156, "learning_rate": 0.00014973728167957498, "loss": 1.2298, "step": 472 }, { "epoch": 0.26913229018492174, "grad_norm": 0.572995126247406, "learning_rate": 0.00014953994295150986, "loss": 1.5102, "step": 473 }, { "epoch": 0.26970128022759604, "grad_norm": 0.5313156843185425, "learning_rate": 0.00014934234822402883, "loss": 1.3345, "step": 474 }, { "epoch": 0.2702702702702703, "grad_norm": 0.5710895657539368, "learning_rate": 0.0001491444985182097, "loss": 1.4461, "step": 475 }, { "epoch": 0.27083926031294453, "grad_norm": 0.5655211210250854, "learning_rate": 0.00014894639485644784, "loss": 1.6591, "step": 476 }, { "epoch": 0.2714082503556188, "grad_norm": 0.5507573485374451, "learning_rate": 0.00014874803826245089, "loss": 1.3442, "step": 477 }, { "epoch": 0.271977240398293, "grad_norm": 0.5628292560577393, "learning_rate": 0.00014854942976123367, "loss": 1.6926, "step": 478 }, { "epoch": 0.27254623044096726, "grad_norm": 0.5278828740119934, "learning_rate": 0.00014835057037911268, "loss": 1.3193, "step": 479 }, { "epoch": 0.27311522048364156, "grad_norm": 0.550122857093811, "learning_rate": 0.0001481514611437008, "loss": 1.4085, "step": 480 }, { "epoch": 0.2736842105263158, "grad_norm": 0.5174803733825684, "learning_rate": 0.00014795210308390211, "loss": 1.2066, "step": 481 }, { "epoch": 0.27425320056899005, "grad_norm": 0.5421956777572632, "learning_rate": 0.00014775249722990646, "loss": 1.4261, "step": 482 }, { "epoch": 0.2748221906116643, "grad_norm": 0.5158098936080933, "learning_rate": 0.00014755264461318416, "loss": 1.277, "step": 483 }, { "epoch": 0.27539118065433854, "grad_norm": 0.5564343929290771, "learning_rate": 0.0001473525462664808, "loss": 1.5075, "step": 484 }, { "epoch": 0.2759601706970128, "grad_norm": 0.5485411882400513, "learning_rate": 0.0001471522032238116, "loss": 1.4847, "step": 485 }, { "epoch": 0.27652916073968703, "grad_norm": 0.5449703931808472, "learning_rate": 0.00014695161652045641, "loss": 1.6162, "step": 486 }, { "epoch": 0.27709815078236133, "grad_norm": 0.5641449093818665, "learning_rate": 0.00014675078719295415, "loss": 1.3614, "step": 487 }, { "epoch": 0.2776671408250356, "grad_norm": 0.5554978251457214, "learning_rate": 0.00014654971627909747, "loss": 1.5019, "step": 488 }, { "epoch": 0.2782361308677098, "grad_norm": 0.5530039668083191, "learning_rate": 0.0001463484048179275, "loss": 1.5116, "step": 489 }, { "epoch": 0.27880512091038406, "grad_norm": 0.5324894189834595, "learning_rate": 0.00014614685384972835, "loss": 1.3575, "step": 490 }, { "epoch": 0.2793741109530583, "grad_norm": 0.5472353100776672, "learning_rate": 0.0001459450644160218, "loss": 1.5364, "step": 491 }, { "epoch": 0.27994310099573255, "grad_norm": 0.5706241130828857, "learning_rate": 0.00014574303755956195, "loss": 1.5958, "step": 492 }, { "epoch": 0.28051209103840685, "grad_norm": 0.5553603768348694, "learning_rate": 0.00014554077432432975, "loss": 1.5664, "step": 493 }, { "epoch": 0.2810810810810811, "grad_norm": 0.542325496673584, "learning_rate": 0.00014533827575552766, "loss": 1.4275, "step": 494 }, { "epoch": 0.28165007112375534, "grad_norm": 0.6180648803710938, "learning_rate": 0.00014513554289957424, "loss": 1.3948, "step": 495 }, { "epoch": 0.2822190611664296, "grad_norm": 0.6009839177131653, "learning_rate": 0.0001449325768040987, "loss": 1.6545, "step": 496 }, { "epoch": 0.28278805120910383, "grad_norm": 0.58924800157547, "learning_rate": 0.00014472937851793557, "loss": 1.3284, "step": 497 }, { "epoch": 0.2833570412517781, "grad_norm": 0.5391841530799866, "learning_rate": 0.0001445259490911192, "loss": 1.3593, "step": 498 }, { "epoch": 0.2839260312944524, "grad_norm": 0.562134325504303, "learning_rate": 0.0001443222895748784, "loss": 1.4458, "step": 499 }, { "epoch": 0.2844950213371266, "grad_norm": 0.5663224458694458, "learning_rate": 0.000144118401021631, "loss": 1.5136, "step": 500 }, { "epoch": 0.28506401137980086, "grad_norm": 0.5762481689453125, "learning_rate": 0.00014391428448497825, "loss": 1.5841, "step": 501 }, { "epoch": 0.2856330014224751, "grad_norm": 0.5568172931671143, "learning_rate": 0.00014370994101969967, "loss": 1.5863, "step": 502 }, { "epoch": 0.28620199146514935, "grad_norm": 0.5461404323577881, "learning_rate": 0.00014350537168174738, "loss": 1.4175, "step": 503 }, { "epoch": 0.2867709815078236, "grad_norm": 0.5522152781486511, "learning_rate": 0.00014330057752824068, "loss": 1.5865, "step": 504 }, { "epoch": 0.28733997155049784, "grad_norm": 0.5333879590034485, "learning_rate": 0.00014309555961746067, "loss": 1.4804, "step": 505 }, { "epoch": 0.28790896159317214, "grad_norm": 0.5656757354736328, "learning_rate": 0.00014289031900884463, "loss": 1.4009, "step": 506 }, { "epoch": 0.2884779516358464, "grad_norm": 0.55275559425354, "learning_rate": 0.00014268485676298078, "loss": 1.3477, "step": 507 }, { "epoch": 0.28904694167852063, "grad_norm": 0.5528755784034729, "learning_rate": 0.00014247917394160254, "loss": 1.6965, "step": 508 }, { "epoch": 0.2896159317211949, "grad_norm": 0.5423591732978821, "learning_rate": 0.00014227327160758316, "loss": 1.3725, "step": 509 }, { "epoch": 0.2901849217638691, "grad_norm": 0.5610995292663574, "learning_rate": 0.00014206715082493032, "loss": 1.5135, "step": 510 }, { "epoch": 0.29075391180654336, "grad_norm": 0.550565242767334, "learning_rate": 0.00014186081265878047, "loss": 1.2824, "step": 511 }, { "epoch": 0.29132290184921766, "grad_norm": 0.5238208174705505, "learning_rate": 0.00014165425817539343, "loss": 1.3519, "step": 512 }, { "epoch": 0.2918918918918919, "grad_norm": 0.5561342835426331, "learning_rate": 0.00014144748844214684, "loss": 1.4381, "step": 513 }, { "epoch": 0.29246088193456615, "grad_norm": 0.5522477030754089, "learning_rate": 0.0001412405045275306, "loss": 1.5873, "step": 514 }, { "epoch": 0.2930298719772404, "grad_norm": 0.5491191744804382, "learning_rate": 0.0001410333075011415, "loss": 1.4527, "step": 515 }, { "epoch": 0.29359886201991464, "grad_norm": 0.5521331429481506, "learning_rate": 0.00014082589843367752, "loss": 1.6342, "step": 516 }, { "epoch": 0.2941678520625889, "grad_norm": 0.5632197856903076, "learning_rate": 0.0001406182783969324, "loss": 1.4758, "step": 517 }, { "epoch": 0.29473684210526313, "grad_norm": 0.5883782505989075, "learning_rate": 0.00014041044846379, "loss": 1.4963, "step": 518 }, { "epoch": 0.29530583214793743, "grad_norm": 0.5621269941329956, "learning_rate": 0.00014020240970821893, "loss": 1.6292, "step": 519 }, { "epoch": 0.2958748221906117, "grad_norm": 0.5850755572319031, "learning_rate": 0.00013999416320526685, "loss": 1.5853, "step": 520 }, { "epoch": 0.2964438122332859, "grad_norm": 0.5468763113021851, "learning_rate": 0.00013978571003105502, "loss": 1.4112, "step": 521 }, { "epoch": 0.29701280227596016, "grad_norm": 0.5954291820526123, "learning_rate": 0.00013957705126277253, "loss": 1.4785, "step": 522 }, { "epoch": 0.2975817923186344, "grad_norm": 0.5438716411590576, "learning_rate": 0.00013936818797867102, "loss": 1.6543, "step": 523 }, { "epoch": 0.29815078236130865, "grad_norm": 0.5444651246070862, "learning_rate": 0.00013915912125805893, "loss": 1.5327, "step": 524 }, { "epoch": 0.29871977240398295, "grad_norm": 0.5755301117897034, "learning_rate": 0.00013894985218129602, "loss": 1.5734, "step": 525 }, { "epoch": 0.2992887624466572, "grad_norm": 0.5267385244369507, "learning_rate": 0.0001387403818297876, "loss": 1.5172, "step": 526 }, { "epoch": 0.29985775248933144, "grad_norm": 0.5721412301063538, "learning_rate": 0.00013853071128597924, "loss": 1.617, "step": 527 }, { "epoch": 0.3004267425320057, "grad_norm": 0.547497570514679, "learning_rate": 0.00013832084163335084, "loss": 1.4242, "step": 528 }, { "epoch": 0.30099573257467993, "grad_norm": 0.5331338047981262, "learning_rate": 0.00013811077395641135, "loss": 1.2921, "step": 529 }, { "epoch": 0.3015647226173542, "grad_norm": 0.5468523502349854, "learning_rate": 0.00013790050934069296, "loss": 1.3264, "step": 530 }, { "epoch": 0.3021337126600285, "grad_norm": 0.538796067237854, "learning_rate": 0.00013769004887274547, "loss": 1.4284, "step": 531 }, { "epoch": 0.3027027027027027, "grad_norm": 0.5727618932723999, "learning_rate": 0.0001374793936401309, "loss": 1.509, "step": 532 }, { "epoch": 0.30327169274537696, "grad_norm": 0.5127109289169312, "learning_rate": 0.00013726854473141765, "loss": 1.3145, "step": 533 }, { "epoch": 0.3038406827880512, "grad_norm": 0.5412492156028748, "learning_rate": 0.00013705750323617495, "loss": 1.4385, "step": 534 }, { "epoch": 0.30440967283072545, "grad_norm": 0.6073004603385925, "learning_rate": 0.0001368462702449672, "loss": 1.585, "step": 535 }, { "epoch": 0.3049786628733997, "grad_norm": 0.6075984239578247, "learning_rate": 0.00013663484684934836, "loss": 1.6782, "step": 536 }, { "epoch": 0.30554765291607394, "grad_norm": 0.5950874090194702, "learning_rate": 0.0001364232341418564, "loss": 1.6634, "step": 537 }, { "epoch": 0.30611664295874824, "grad_norm": 0.5442619323730469, "learning_rate": 0.00013621143321600746, "loss": 1.6321, "step": 538 }, { "epoch": 0.3066856330014225, "grad_norm": 0.5568251609802246, "learning_rate": 0.00013599944516629045, "loss": 1.3718, "step": 539 }, { "epoch": 0.30725462304409673, "grad_norm": 0.5321120023727417, "learning_rate": 0.00013578727108816104, "loss": 1.3387, "step": 540 }, { "epoch": 0.307823613086771, "grad_norm": 0.6142572164535522, "learning_rate": 0.00013557491207803635, "loss": 1.4013, "step": 541 }, { "epoch": 0.3083926031294452, "grad_norm": 0.5809832811355591, "learning_rate": 0.0001353623692332891, "loss": 1.2896, "step": 542 }, { "epoch": 0.30896159317211946, "grad_norm": 0.5262885689735413, "learning_rate": 0.00013514964365224206, "loss": 1.4799, "step": 543 }, { "epoch": 0.30953058321479376, "grad_norm": 0.5609673261642456, "learning_rate": 0.00013493673643416218, "loss": 1.461, "step": 544 }, { "epoch": 0.310099573257468, "grad_norm": 0.5489050149917603, "learning_rate": 0.0001347236486792551, "loss": 1.3912, "step": 545 }, { "epoch": 0.31066856330014225, "grad_norm": 0.55717533826828, "learning_rate": 0.0001345103814886593, "loss": 1.4207, "step": 546 }, { "epoch": 0.3112375533428165, "grad_norm": 0.5326306819915771, "learning_rate": 0.00013429693596444067, "loss": 1.563, "step": 547 }, { "epoch": 0.31180654338549074, "grad_norm": 0.5783535838127136, "learning_rate": 0.00013408331320958648, "loss": 1.4829, "step": 548 }, { "epoch": 0.312375533428165, "grad_norm": 0.5628453493118286, "learning_rate": 0.00013386951432799987, "loss": 1.4815, "step": 549 }, { "epoch": 0.3129445234708393, "grad_norm": 0.5468215346336365, "learning_rate": 0.00013365554042449427, "loss": 1.3575, "step": 550 }, { "epoch": 0.31351351351351353, "grad_norm": 0.5711040496826172, "learning_rate": 0.00013344139260478732, "loss": 1.5833, "step": 551 }, { "epoch": 0.3140825035561878, "grad_norm": 0.5313072204589844, "learning_rate": 0.00013322707197549555, "loss": 1.5447, "step": 552 }, { "epoch": 0.314651493598862, "grad_norm": 0.6006999015808105, "learning_rate": 0.00013301257964412844, "loss": 1.747, "step": 553 }, { "epoch": 0.31522048364153626, "grad_norm": 0.6007615923881531, "learning_rate": 0.00013279791671908268, "loss": 1.5486, "step": 554 }, { "epoch": 0.3157894736842105, "grad_norm": 0.553854763507843, "learning_rate": 0.00013258308430963664, "loss": 1.4473, "step": 555 }, { "epoch": 0.31635846372688475, "grad_norm": 0.5920282006263733, "learning_rate": 0.00013236808352594433, "loss": 1.4883, "step": 556 }, { "epoch": 0.31692745376955905, "grad_norm": 0.5819621682167053, "learning_rate": 0.00013215291547903006, "loss": 1.4925, "step": 557 }, { "epoch": 0.3174964438122333, "grad_norm": 0.5728132128715515, "learning_rate": 0.0001319375812807823, "loss": 1.3921, "step": 558 }, { "epoch": 0.31806543385490754, "grad_norm": 0.6309751868247986, "learning_rate": 0.0001317220820439481, "loss": 1.6893, "step": 559 }, { "epoch": 0.3186344238975818, "grad_norm": 0.5545490384101868, "learning_rate": 0.00013150641888212756, "loss": 1.4053, "step": 560 }, { "epoch": 0.31920341394025603, "grad_norm": 0.5476984977722168, "learning_rate": 0.00013129059290976767, "loss": 1.3499, "step": 561 }, { "epoch": 0.3197724039829303, "grad_norm": 0.5255653262138367, "learning_rate": 0.00013107460524215678, "loss": 1.318, "step": 562 }, { "epoch": 0.3203413940256046, "grad_norm": 0.649142861366272, "learning_rate": 0.0001308584569954189, "loss": 1.6503, "step": 563 }, { "epoch": 0.3209103840682788, "grad_norm": 0.5934924483299255, "learning_rate": 0.0001306421492865077, "loss": 1.5933, "step": 564 }, { "epoch": 0.32147937411095306, "grad_norm": 0.5277055501937866, "learning_rate": 0.00013042568323320107, "loss": 1.4174, "step": 565 }, { "epoch": 0.3220483641536273, "grad_norm": 0.5566196441650391, "learning_rate": 0.00013020905995409497, "loss": 1.4713, "step": 566 }, { "epoch": 0.32261735419630155, "grad_norm": 0.5719363689422607, "learning_rate": 0.00012999228056859784, "loss": 1.5238, "step": 567 }, { "epoch": 0.3231863442389758, "grad_norm": 0.5720301866531372, "learning_rate": 0.00012977534619692494, "loss": 1.5374, "step": 568 }, { "epoch": 0.3237553342816501, "grad_norm": 0.5727265477180481, "learning_rate": 0.0001295582579600923, "loss": 1.4789, "step": 569 }, { "epoch": 0.32432432432432434, "grad_norm": 0.5553936958312988, "learning_rate": 0.00012934101697991115, "loss": 1.2535, "step": 570 }, { "epoch": 0.3248933143669986, "grad_norm": 0.5490901470184326, "learning_rate": 0.00012912362437898192, "loss": 1.4513, "step": 571 }, { "epoch": 0.32546230440967283, "grad_norm": 0.5691761374473572, "learning_rate": 0.0001289060812806886, "loss": 1.5947, "step": 572 }, { "epoch": 0.3260312944523471, "grad_norm": 0.5883947610855103, "learning_rate": 0.00012868838880919294, "loss": 1.3175, "step": 573 }, { "epoch": 0.3266002844950213, "grad_norm": 0.5340852737426758, "learning_rate": 0.00012847054808942847, "loss": 1.1903, "step": 574 }, { "epoch": 0.32716927453769556, "grad_norm": 0.5509372353553772, "learning_rate": 0.0001282525602470949, "loss": 1.5289, "step": 575 }, { "epoch": 0.32773826458036986, "grad_norm": 0.5860341191291809, "learning_rate": 0.00012803442640865208, "loss": 1.6618, "step": 576 }, { "epoch": 0.3283072546230441, "grad_norm": 0.540502667427063, "learning_rate": 0.00012781614770131442, "loss": 1.5062, "step": 577 }, { "epoch": 0.32887624466571835, "grad_norm": 0.5500742793083191, "learning_rate": 0.00012759772525304492, "loss": 1.6137, "step": 578 }, { "epoch": 0.3294452347083926, "grad_norm": 0.550717830657959, "learning_rate": 0.00012737916019254933, "loss": 1.6204, "step": 579 }, { "epoch": 0.33001422475106684, "grad_norm": 0.5424780249595642, "learning_rate": 0.00012716045364927035, "loss": 1.3499, "step": 580 }, { "epoch": 0.3305832147937411, "grad_norm": 0.5449280142784119, "learning_rate": 0.0001269416067533818, "loss": 1.518, "step": 581 }, { "epoch": 0.3311522048364154, "grad_norm": 0.5500824451446533, "learning_rate": 0.0001267226206357828, "loss": 1.6019, "step": 582 }, { "epoch": 0.33172119487908963, "grad_norm": 0.5455232262611389, "learning_rate": 0.00012650349642809197, "loss": 1.5048, "step": 583 }, { "epoch": 0.3322901849217639, "grad_norm": 0.5600374937057495, "learning_rate": 0.00012628423526264134, "loss": 1.4539, "step": 584 }, { "epoch": 0.3328591749644381, "grad_norm": 0.5611444115638733, "learning_rate": 0.0001260648382724708, "loss": 1.4871, "step": 585 }, { "epoch": 0.33342816500711236, "grad_norm": 0.5722511410713196, "learning_rate": 0.00012584530659132215, "loss": 1.4491, "step": 586 }, { "epoch": 0.3339971550497866, "grad_norm": 0.5913495421409607, "learning_rate": 0.00012562564135363313, "loss": 1.136, "step": 587 }, { "epoch": 0.3345661450924609, "grad_norm": 0.578739583492279, "learning_rate": 0.00012540584369453162, "loss": 1.3503, "step": 588 }, { "epoch": 0.33513513513513515, "grad_norm": 0.5618348717689514, "learning_rate": 0.00012518591474982985, "loss": 1.5827, "step": 589 }, { "epoch": 0.3357041251778094, "grad_norm": 0.5958595871925354, "learning_rate": 0.00012496585565601853, "loss": 1.6305, "step": 590 }, { "epoch": 0.33627311522048364, "grad_norm": 0.5362867116928101, "learning_rate": 0.00012474566755026073, "loss": 1.416, "step": 591 }, { "epoch": 0.3368421052631579, "grad_norm": 0.5598848462104797, "learning_rate": 0.00012452535157038641, "loss": 1.4456, "step": 592 }, { "epoch": 0.33741109530583213, "grad_norm": 0.5422506332397461, "learning_rate": 0.00012430490885488617, "loss": 1.3472, "step": 593 }, { "epoch": 0.3379800853485064, "grad_norm": 0.5901892781257629, "learning_rate": 0.00012408434054290561, "loss": 1.5748, "step": 594 }, { "epoch": 0.3385490753911807, "grad_norm": 0.5219245553016663, "learning_rate": 0.00012386364777423932, "loss": 1.3369, "step": 595 }, { "epoch": 0.3391180654338549, "grad_norm": 0.5885049104690552, "learning_rate": 0.00012364283168932495, "loss": 1.5212, "step": 596 }, { "epoch": 0.33968705547652916, "grad_norm": 0.5666311383247375, "learning_rate": 0.0001234218934292376, "loss": 1.5041, "step": 597 }, { "epoch": 0.3402560455192034, "grad_norm": 0.6065592765808105, "learning_rate": 0.0001232008341356835, "loss": 1.5489, "step": 598 }, { "epoch": 0.34082503556187765, "grad_norm": 0.6251218914985657, "learning_rate": 0.0001229796549509944, "loss": 1.5043, "step": 599 }, { "epoch": 0.3413940256045519, "grad_norm": 0.562077522277832, "learning_rate": 0.00012275835701812163, "loss": 1.547, "step": 600 }, { "epoch": 0.3419630156472262, "grad_norm": 0.5375682711601257, "learning_rate": 0.00012253694148063013, "loss": 1.3999, "step": 601 }, { "epoch": 0.34253200568990044, "grad_norm": 0.583003044128418, "learning_rate": 0.0001223154094826925, "loss": 1.641, "step": 602 }, { "epoch": 0.3431009957325747, "grad_norm": 0.619719922542572, "learning_rate": 0.00012209376216908328, "loss": 1.5772, "step": 603 }, { "epoch": 0.34366998577524893, "grad_norm": 0.5548385977745056, "learning_rate": 0.00012187200068517277, "loss": 1.4802, "step": 604 }, { "epoch": 0.3442389758179232, "grad_norm": 0.5717220902442932, "learning_rate": 0.00012165012617692143, "loss": 1.533, "step": 605 }, { "epoch": 0.3448079658605974, "grad_norm": 0.5915637016296387, "learning_rate": 0.00012142813979087356, "loss": 1.4618, "step": 606 }, { "epoch": 0.34537695590327167, "grad_norm": 0.5780906081199646, "learning_rate": 0.00012120604267415172, "loss": 1.428, "step": 607 }, { "epoch": 0.34594594594594597, "grad_norm": 0.6107869744300842, "learning_rate": 0.0001209838359744507, "loss": 1.6056, "step": 608 }, { "epoch": 0.3465149359886202, "grad_norm": 0.5807276368141174, "learning_rate": 0.0001207615208400315, "loss": 1.4344, "step": 609 }, { "epoch": 0.34708392603129445, "grad_norm": 0.5761096477508545, "learning_rate": 0.00012053909841971547, "loss": 1.6409, "step": 610 }, { "epoch": 0.3476529160739687, "grad_norm": 0.5648180246353149, "learning_rate": 0.00012031656986287835, "loss": 1.5207, "step": 611 }, { "epoch": 0.34822190611664294, "grad_norm": 0.5846616625785828, "learning_rate": 0.00012009393631944439, "loss": 1.709, "step": 612 }, { "epoch": 0.3487908961593172, "grad_norm": 0.5779747366905212, "learning_rate": 0.00011987119893988035, "loss": 1.5626, "step": 613 }, { "epoch": 0.3493598862019915, "grad_norm": 0.5634474158287048, "learning_rate": 0.00011964835887518955, "loss": 1.645, "step": 614 }, { "epoch": 0.34992887624466573, "grad_norm": 0.5536413788795471, "learning_rate": 0.00011942541727690593, "loss": 1.4927, "step": 615 }, { "epoch": 0.35049786628734, "grad_norm": 0.5312451720237732, "learning_rate": 0.00011920237529708811, "loss": 1.3328, "step": 616 }, { "epoch": 0.3510668563300142, "grad_norm": 0.5960412621498108, "learning_rate": 0.00011897923408831346, "loss": 1.5827, "step": 617 }, { "epoch": 0.35163584637268847, "grad_norm": 0.598399817943573, "learning_rate": 0.00011875599480367215, "loss": 1.5477, "step": 618 }, { "epoch": 0.3522048364153627, "grad_norm": 0.517993688583374, "learning_rate": 0.00011853265859676108, "loss": 1.3741, "step": 619 }, { "epoch": 0.352773826458037, "grad_norm": 0.5564917922019958, "learning_rate": 0.00011830922662167803, "loss": 1.3112, "step": 620 }, { "epoch": 0.35334281650071125, "grad_norm": 0.5626814961433411, "learning_rate": 0.00011808570003301566, "loss": 1.5272, "step": 621 }, { "epoch": 0.3539118065433855, "grad_norm": 0.6245387196540833, "learning_rate": 0.00011786207998585559, "loss": 1.433, "step": 622 }, { "epoch": 0.35448079658605974, "grad_norm": 0.5711420178413391, "learning_rate": 0.00011763836763576237, "loss": 1.4975, "step": 623 }, { "epoch": 0.355049786628734, "grad_norm": 0.5550587177276611, "learning_rate": 0.00011741456413877749, "loss": 1.3973, "step": 624 }, { "epoch": 0.35561877667140823, "grad_norm": 0.583817183971405, "learning_rate": 0.00011719067065141352, "loss": 1.4535, "step": 625 }, { "epoch": 0.3561877667140825, "grad_norm": 0.5912776589393616, "learning_rate": 0.00011696668833064795, "loss": 1.5161, "step": 626 }, { "epoch": 0.3567567567567568, "grad_norm": 0.615287184715271, "learning_rate": 0.0001167426183339174, "loss": 1.6331, "step": 627 }, { "epoch": 0.357325746799431, "grad_norm": 0.5431495308876038, "learning_rate": 0.00011651846181911161, "loss": 1.5279, "step": 628 }, { "epoch": 0.35789473684210527, "grad_norm": 0.5510687232017517, "learning_rate": 0.00011629421994456723, "loss": 1.5859, "step": 629 }, { "epoch": 0.3584637268847795, "grad_norm": 0.5746335983276367, "learning_rate": 0.0001160698938690622, "loss": 1.4053, "step": 630 }, { "epoch": 0.35903271692745375, "grad_norm": 0.5783334374427795, "learning_rate": 0.00011584548475180943, "loss": 1.6259, "step": 631 }, { "epoch": 0.359601706970128, "grad_norm": 0.5857696533203125, "learning_rate": 0.00011562099375245108, "loss": 1.4625, "step": 632 }, { "epoch": 0.3601706970128023, "grad_norm": 0.580596387386322, "learning_rate": 0.00011539642203105232, "loss": 1.511, "step": 633 }, { "epoch": 0.36073968705547654, "grad_norm": 0.5730242729187012, "learning_rate": 0.00011517177074809546, "loss": 1.6307, "step": 634 }, { "epoch": 0.3613086770981508, "grad_norm": 0.567469596862793, "learning_rate": 0.0001149470410644741, "loss": 1.5477, "step": 635 }, { "epoch": 0.36187766714082503, "grad_norm": 0.5704171061515808, "learning_rate": 0.00011472223414148675, "loss": 1.4716, "step": 636 }, { "epoch": 0.3624466571834993, "grad_norm": 0.5398246645927429, "learning_rate": 0.00011449735114083127, "loss": 1.6304, "step": 637 }, { "epoch": 0.3630156472261735, "grad_norm": 0.5576680898666382, "learning_rate": 0.0001142723932245985, "loss": 1.4775, "step": 638 }, { "epoch": 0.3635846372688478, "grad_norm": 0.5728341341018677, "learning_rate": 0.00011404736155526645, "loss": 1.6101, "step": 639 }, { "epoch": 0.36415362731152207, "grad_norm": 0.54744553565979, "learning_rate": 0.00011382225729569436, "loss": 1.2536, "step": 640 }, { "epoch": 0.3647226173541963, "grad_norm": 0.5593659281730652, "learning_rate": 0.00011359708160911641, "loss": 1.4138, "step": 641 }, { "epoch": 0.36529160739687055, "grad_norm": 0.5415304899215698, "learning_rate": 0.00011337183565913599, "loss": 1.5221, "step": 642 }, { "epoch": 0.3658605974395448, "grad_norm": 0.5653886198997498, "learning_rate": 0.00011314652060971955, "loss": 1.5221, "step": 643 }, { "epoch": 0.36642958748221904, "grad_norm": 0.5842243432998657, "learning_rate": 0.00011292113762519061, "loss": 1.501, "step": 644 }, { "epoch": 0.3669985775248933, "grad_norm": 0.5919954180717468, "learning_rate": 0.00011269568787022376, "loss": 1.5444, "step": 645 }, { "epoch": 0.3675675675675676, "grad_norm": 0.5867476463317871, "learning_rate": 0.00011247017250983865, "loss": 1.4897, "step": 646 }, { "epoch": 0.36813655761024183, "grad_norm": 0.5661168098449707, "learning_rate": 0.00011224459270939384, "loss": 1.3373, "step": 647 }, { "epoch": 0.3687055476529161, "grad_norm": 0.5516852736473083, "learning_rate": 0.00011201894963458106, "loss": 1.6209, "step": 648 }, { "epoch": 0.3692745376955903, "grad_norm": 0.615533709526062, "learning_rate": 0.00011179324445141883, "loss": 1.369, "step": 649 }, { "epoch": 0.36984352773826457, "grad_norm": 0.5543255805969238, "learning_rate": 0.00011156747832624679, "loss": 1.3172, "step": 650 }, { "epoch": 0.3704125177809388, "grad_norm": 0.5759336352348328, "learning_rate": 0.00011134165242571938, "loss": 1.5896, "step": 651 }, { "epoch": 0.3709815078236131, "grad_norm": 0.5587149858474731, "learning_rate": 0.00011111576791679994, "loss": 1.5963, "step": 652 }, { "epoch": 0.37155049786628735, "grad_norm": 0.5666396617889404, "learning_rate": 0.00011088982596675475, "loss": 1.5253, "step": 653 }, { "epoch": 0.3721194879089616, "grad_norm": 0.5888431668281555, "learning_rate": 0.00011066382774314683, "loss": 1.4419, "step": 654 }, { "epoch": 0.37268847795163584, "grad_norm": 0.5519063472747803, "learning_rate": 0.00011043777441383006, "loss": 1.5396, "step": 655 }, { "epoch": 0.3732574679943101, "grad_norm": 0.5812383890151978, "learning_rate": 0.00011021166714694297, "loss": 1.2045, "step": 656 }, { "epoch": 0.37382645803698433, "grad_norm": 0.5881744623184204, "learning_rate": 0.000109985507110903, "loss": 1.4078, "step": 657 }, { "epoch": 0.37439544807965863, "grad_norm": 0.5681930184364319, "learning_rate": 0.00010975929547440016, "loss": 1.4739, "step": 658 }, { "epoch": 0.3749644381223329, "grad_norm": 0.5596330165863037, "learning_rate": 0.0001095330334063911, "loss": 1.4085, "step": 659 }, { "epoch": 0.3755334281650071, "grad_norm": 0.5785601139068604, "learning_rate": 0.00010930672207609306, "loss": 1.4087, "step": 660 }, { "epoch": 0.37610241820768137, "grad_norm": 0.5467891097068787, "learning_rate": 0.00010908036265297794, "loss": 1.6924, "step": 661 }, { "epoch": 0.3766714082503556, "grad_norm": 0.5449764132499695, "learning_rate": 0.00010885395630676607, "loss": 1.5254, "step": 662 }, { "epoch": 0.37724039829302985, "grad_norm": 0.5570394396781921, "learning_rate": 0.00010862750420742031, "loss": 1.4218, "step": 663 }, { "epoch": 0.3778093883357041, "grad_norm": 0.5946861505508423, "learning_rate": 0.00010840100752513996, "loss": 1.6474, "step": 664 }, { "epoch": 0.3783783783783784, "grad_norm": 0.545051097869873, "learning_rate": 0.00010817446743035462, "loss": 1.459, "step": 665 }, { "epoch": 0.37894736842105264, "grad_norm": 0.5713635683059692, "learning_rate": 0.00010794788509371829, "loss": 1.44, "step": 666 }, { "epoch": 0.3795163584637269, "grad_norm": 0.5865978598594666, "learning_rate": 0.00010772126168610325, "loss": 1.5968, "step": 667 }, { "epoch": 0.38008534850640113, "grad_norm": 0.5625496506690979, "learning_rate": 0.00010749459837859408, "loss": 1.4018, "step": 668 }, { "epoch": 0.3806543385490754, "grad_norm": 0.5960560441017151, "learning_rate": 0.00010726789634248137, "loss": 1.5808, "step": 669 }, { "epoch": 0.3812233285917496, "grad_norm": 0.6137279868125916, "learning_rate": 0.00010704115674925604, "loss": 1.212, "step": 670 }, { "epoch": 0.3817923186344239, "grad_norm": 0.5478764772415161, "learning_rate": 0.00010681438077060291, "loss": 1.4701, "step": 671 }, { "epoch": 0.38236130867709817, "grad_norm": 0.6135146021842957, "learning_rate": 0.000106587569578395, "loss": 1.5428, "step": 672 }, { "epoch": 0.3829302987197724, "grad_norm": 0.5707561373710632, "learning_rate": 0.00010636072434468714, "loss": 1.5299, "step": 673 }, { "epoch": 0.38349928876244666, "grad_norm": 0.529769778251648, "learning_rate": 0.00010613384624171016, "loss": 1.4161, "step": 674 }, { "epoch": 0.3840682788051209, "grad_norm": 0.5672623515129089, "learning_rate": 0.00010590693644186474, "loss": 1.5084, "step": 675 }, { "epoch": 0.38463726884779514, "grad_norm": 0.5277720093727112, "learning_rate": 0.00010567999611771528, "loss": 1.2255, "step": 676 }, { "epoch": 0.38520625889046944, "grad_norm": 0.5478918552398682, "learning_rate": 0.00010545302644198405, "loss": 1.3878, "step": 677 }, { "epoch": 0.3857752489331437, "grad_norm": 0.5412498712539673, "learning_rate": 0.00010522602858754487, "loss": 1.5586, "step": 678 }, { "epoch": 0.38634423897581793, "grad_norm": 0.5770754814147949, "learning_rate": 0.00010499900372741718, "loss": 1.3127, "step": 679 }, { "epoch": 0.3869132290184922, "grad_norm": 0.5917402505874634, "learning_rate": 0.00010477195303476011, "loss": 1.3799, "step": 680 }, { "epoch": 0.3874822190611664, "grad_norm": 0.5400240421295166, "learning_rate": 0.00010454487768286612, "loss": 1.2999, "step": 681 }, { "epoch": 0.38805120910384067, "grad_norm": 0.5468504428863525, "learning_rate": 0.00010431777884515514, "loss": 1.3114, "step": 682 }, { "epoch": 0.3886201991465149, "grad_norm": 0.5608039498329163, "learning_rate": 0.00010409065769516856, "loss": 1.3888, "step": 683 }, { "epoch": 0.3891891891891892, "grad_norm": 0.5961167216300964, "learning_rate": 0.00010386351540656292, "loss": 1.5431, "step": 684 }, { "epoch": 0.38975817923186346, "grad_norm": 0.5718376040458679, "learning_rate": 0.00010363635315310414, "loss": 1.521, "step": 685 }, { "epoch": 0.3903271692745377, "grad_norm": 0.5798651576042175, "learning_rate": 0.00010340917210866118, "loss": 1.519, "step": 686 }, { "epoch": 0.39089615931721194, "grad_norm": 0.5611982941627502, "learning_rate": 0.00010318197344720018, "loss": 1.499, "step": 687 }, { "epoch": 0.3914651493598862, "grad_norm": 0.571074366569519, "learning_rate": 0.00010295475834277831, "loss": 1.4738, "step": 688 }, { "epoch": 0.39203413940256043, "grad_norm": 0.5722329020500183, "learning_rate": 0.00010272752796953766, "loss": 1.6584, "step": 689 }, { "epoch": 0.39260312944523473, "grad_norm": 0.5674881935119629, "learning_rate": 0.00010250028350169931, "loss": 1.5507, "step": 690 }, { "epoch": 0.393172119487909, "grad_norm": 0.5546680688858032, "learning_rate": 0.00010227302611355712, "loss": 1.297, "step": 691 }, { "epoch": 0.3937411095305832, "grad_norm": 0.5614904165267944, "learning_rate": 0.00010204575697947168, "loss": 1.4416, "step": 692 }, { "epoch": 0.39431009957325747, "grad_norm": 0.5829195380210876, "learning_rate": 0.00010181847727386433, "loss": 1.5031, "step": 693 }, { "epoch": 0.3948790896159317, "grad_norm": 0.5744046568870544, "learning_rate": 0.00010159118817121105, "loss": 1.4576, "step": 694 }, { "epoch": 0.39544807965860596, "grad_norm": 0.572902262210846, "learning_rate": 0.00010136389084603637, "loss": 1.5078, "step": 695 }, { "epoch": 0.3960170697012802, "grad_norm": 0.5696277618408203, "learning_rate": 0.00010113658647290723, "loss": 1.4636, "step": 696 }, { "epoch": 0.3960170697012802, "eval_loss": 1.4791862964630127, "eval_runtime": 15.3322, "eval_samples_per_second": 48.265, "eval_steps_per_second": 24.132, "step": 696 }, { "epoch": 0.3965860597439545, "grad_norm": 0.5474138855934143, "learning_rate": 0.0001009092762264271, "loss": 1.4683, "step": 697 }, { "epoch": 0.39715504978662874, "grad_norm": 0.6160016059875488, "learning_rate": 0.00010068196128122975, "loss": 1.6705, "step": 698 }, { "epoch": 0.397724039829303, "grad_norm": 0.5745415687561035, "learning_rate": 0.00010045464281197327, "loss": 1.5104, "step": 699 }, { "epoch": 0.39829302987197723, "grad_norm": 0.5802525281906128, "learning_rate": 0.0001002273219933339, "loss": 1.4029, "step": 700 }, { "epoch": 0.3988620199146515, "grad_norm": 0.5592519044876099, "learning_rate": 0.0001, "loss": 1.6325, "step": 701 }, { "epoch": 0.3994310099573257, "grad_norm": 0.6051873564720154, "learning_rate": 9.977267800666613e-05, "loss": 1.688, "step": 702 }, { "epoch": 0.4, "grad_norm": 0.5836036205291748, "learning_rate": 9.954535718802675e-05, "loss": 1.3107, "step": 703 }, { "epoch": 0.40056899004267427, "grad_norm": 0.5733322501182556, "learning_rate": 9.931803871877028e-05, "loss": 1.7469, "step": 704 }, { "epoch": 0.4011379800853485, "grad_norm": 0.5718969106674194, "learning_rate": 9.909072377357294e-05, "loss": 1.3822, "step": 705 }, { "epoch": 0.40170697012802276, "grad_norm": 0.5877561569213867, "learning_rate": 9.88634135270928e-05, "loss": 1.6344, "step": 706 }, { "epoch": 0.402275960170697, "grad_norm": 0.5636436939239502, "learning_rate": 9.863610915396365e-05, "loss": 1.5552, "step": 707 }, { "epoch": 0.40284495021337124, "grad_norm": 0.5809296369552612, "learning_rate": 9.840881182878895e-05, "loss": 1.3633, "step": 708 }, { "epoch": 0.40341394025604554, "grad_norm": 0.5500168204307556, "learning_rate": 9.81815227261357e-05, "loss": 1.4063, "step": 709 }, { "epoch": 0.4039829302987198, "grad_norm": 0.5806904435157776, "learning_rate": 9.795424302052836e-05, "loss": 1.5629, "step": 710 }, { "epoch": 0.40455192034139403, "grad_norm": 0.5868257880210876, "learning_rate": 9.77269738864429e-05, "loss": 1.3655, "step": 711 }, { "epoch": 0.4051209103840683, "grad_norm": 0.5417432188987732, "learning_rate": 9.749971649830071e-05, "loss": 1.4914, "step": 712 }, { "epoch": 0.4056899004267425, "grad_norm": 0.6012546420097351, "learning_rate": 9.727247203046234e-05, "loss": 1.5365, "step": 713 }, { "epoch": 0.40625889046941677, "grad_norm": 0.5691578388214111, "learning_rate": 9.704524165722174e-05, "loss": 1.5959, "step": 714 }, { "epoch": 0.406827880512091, "grad_norm": 0.5487850904464722, "learning_rate": 9.681802655279986e-05, "loss": 1.4469, "step": 715 }, { "epoch": 0.4073968705547653, "grad_norm": 0.6280918121337891, "learning_rate": 9.659082789133884e-05, "loss": 1.338, "step": 716 }, { "epoch": 0.40796586059743956, "grad_norm": 0.5909377932548523, "learning_rate": 9.63636468468959e-05, "loss": 1.6272, "step": 717 }, { "epoch": 0.4085348506401138, "grad_norm": 0.6044595837593079, "learning_rate": 9.613648459343708e-05, "loss": 1.5717, "step": 718 }, { "epoch": 0.40910384068278804, "grad_norm": 0.5833640098571777, "learning_rate": 9.590934230483149e-05, "loss": 1.4213, "step": 719 }, { "epoch": 0.4096728307254623, "grad_norm": 0.6057854890823364, "learning_rate": 9.568222115484488e-05, "loss": 1.4861, "step": 720 }, { "epoch": 0.41024182076813653, "grad_norm": 0.5813032984733582, "learning_rate": 9.54551223171339e-05, "loss": 1.5329, "step": 721 }, { "epoch": 0.41081081081081083, "grad_norm": 0.5498741865158081, "learning_rate": 9.522804696523991e-05, "loss": 1.4457, "step": 722 }, { "epoch": 0.4113798008534851, "grad_norm": 0.5357645750045776, "learning_rate": 9.500099627258282e-05, "loss": 1.2792, "step": 723 }, { "epoch": 0.4119487908961593, "grad_norm": 0.5478993654251099, "learning_rate": 9.477397141245519e-05, "loss": 1.5071, "step": 724 }, { "epoch": 0.41251778093883357, "grad_norm": 0.5776642560958862, "learning_rate": 9.454697355801598e-05, "loss": 1.3664, "step": 725 }, { "epoch": 0.4130867709815078, "grad_norm": 0.6283994913101196, "learning_rate": 9.432000388228473e-05, "loss": 1.3994, "step": 726 }, { "epoch": 0.41365576102418206, "grad_norm": 0.6153956651687622, "learning_rate": 9.409306355813529e-05, "loss": 1.2524, "step": 727 }, { "epoch": 0.41422475106685636, "grad_norm": 0.5952728986740112, "learning_rate": 9.386615375828984e-05, "loss": 1.5941, "step": 728 }, { "epoch": 0.4147937411095306, "grad_norm": 0.5799689292907715, "learning_rate": 9.36392756553129e-05, "loss": 1.3113, "step": 729 }, { "epoch": 0.41536273115220484, "grad_norm": 0.5933107733726501, "learning_rate": 9.341243042160503e-05, "loss": 1.6378, "step": 730 }, { "epoch": 0.4159317211948791, "grad_norm": 0.5808780789375305, "learning_rate": 9.318561922939711e-05, "loss": 1.663, "step": 731 }, { "epoch": 0.41650071123755333, "grad_norm": 0.5661304593086243, "learning_rate": 9.295884325074398e-05, "loss": 1.4145, "step": 732 }, { "epoch": 0.4170697012802276, "grad_norm": 0.577038049697876, "learning_rate": 9.273210365751862e-05, "loss": 1.4288, "step": 733 }, { "epoch": 0.4176386913229018, "grad_norm": 0.5904839038848877, "learning_rate": 9.250540162140597e-05, "loss": 1.5257, "step": 734 }, { "epoch": 0.4182076813655761, "grad_norm": 0.5645294785499573, "learning_rate": 9.227873831389677e-05, "loss": 1.4073, "step": 735 }, { "epoch": 0.41877667140825037, "grad_norm": 0.5541549921035767, "learning_rate": 9.205211490628173e-05, "loss": 1.3965, "step": 736 }, { "epoch": 0.4193456614509246, "grad_norm": 0.6137387752532959, "learning_rate": 9.18255325696454e-05, "loss": 1.3849, "step": 737 }, { "epoch": 0.41991465149359886, "grad_norm": 0.588316798210144, "learning_rate": 9.159899247486004e-05, "loss": 1.4989, "step": 738 }, { "epoch": 0.4204836415362731, "grad_norm": 0.567848265171051, "learning_rate": 9.13724957925797e-05, "loss": 1.4989, "step": 739 }, { "epoch": 0.42105263157894735, "grad_norm": 0.554695188999176, "learning_rate": 9.114604369323395e-05, "loss": 1.4509, "step": 740 }, { "epoch": 0.42162162162162165, "grad_norm": 0.5507339835166931, "learning_rate": 9.091963734702208e-05, "loss": 1.364, "step": 741 }, { "epoch": 0.4221906116642959, "grad_norm": 0.569786012172699, "learning_rate": 9.069327792390695e-05, "loss": 1.5775, "step": 742 }, { "epoch": 0.42275960170697013, "grad_norm": 0.563234806060791, "learning_rate": 9.046696659360894e-05, "loss": 1.4557, "step": 743 }, { "epoch": 0.4233285917496444, "grad_norm": 0.5537723302841187, "learning_rate": 9.024070452559986e-05, "loss": 1.443, "step": 744 }, { "epoch": 0.4238975817923186, "grad_norm": 0.5786699056625366, "learning_rate": 9.001449288909702e-05, "loss": 1.2683, "step": 745 }, { "epoch": 0.42446657183499287, "grad_norm": 0.55182945728302, "learning_rate": 8.978833285305705e-05, "loss": 1.4565, "step": 746 }, { "epoch": 0.42503556187766717, "grad_norm": 0.5818150043487549, "learning_rate": 8.956222558616998e-05, "loss": 1.6502, "step": 747 }, { "epoch": 0.4256045519203414, "grad_norm": 0.6044638752937317, "learning_rate": 8.933617225685319e-05, "loss": 1.4631, "step": 748 }, { "epoch": 0.42617354196301566, "grad_norm": 0.568188488483429, "learning_rate": 8.91101740332453e-05, "loss": 1.5997, "step": 749 }, { "epoch": 0.4267425320056899, "grad_norm": 0.5530648231506348, "learning_rate": 8.888423208320008e-05, "loss": 1.174, "step": 750 }, { "epoch": 0.42731152204836415, "grad_norm": 0.5782289505004883, "learning_rate": 8.865834757428064e-05, "loss": 1.5198, "step": 751 }, { "epoch": 0.4278805120910384, "grad_norm": 0.5685307383537292, "learning_rate": 8.843252167375322e-05, "loss": 1.5545, "step": 752 }, { "epoch": 0.42844950213371263, "grad_norm": 0.5832937359809875, "learning_rate": 8.820675554858115e-05, "loss": 1.5776, "step": 753 }, { "epoch": 0.42901849217638693, "grad_norm": 0.6279184818267822, "learning_rate": 8.7981050365419e-05, "loss": 1.5975, "step": 754 }, { "epoch": 0.4295874822190612, "grad_norm": 0.5440697073936462, "learning_rate": 8.775540729060618e-05, "loss": 1.3772, "step": 755 }, { "epoch": 0.4301564722617354, "grad_norm": 0.6341460347175598, "learning_rate": 8.752982749016139e-05, "loss": 1.573, "step": 756 }, { "epoch": 0.43072546230440967, "grad_norm": 0.5840321779251099, "learning_rate": 8.730431212977625e-05, "loss": 1.5051, "step": 757 }, { "epoch": 0.4312944523470839, "grad_norm": 0.5965592265129089, "learning_rate": 8.70788623748094e-05, "loss": 1.5323, "step": 758 }, { "epoch": 0.43186344238975816, "grad_norm": 0.5905702710151672, "learning_rate": 8.68534793902805e-05, "loss": 1.4051, "step": 759 }, { "epoch": 0.43243243243243246, "grad_norm": 0.5640906691551208, "learning_rate": 8.662816434086404e-05, "loss": 1.6614, "step": 760 }, { "epoch": 0.4330014224751067, "grad_norm": 0.5574825406074524, "learning_rate": 8.64029183908836e-05, "loss": 1.3464, "step": 761 }, { "epoch": 0.43357041251778095, "grad_norm": 0.5866842865943909, "learning_rate": 8.617774270430566e-05, "loss": 1.4531, "step": 762 }, { "epoch": 0.4341394025604552, "grad_norm": 0.6260978579521179, "learning_rate": 8.595263844473353e-05, "loss": 1.4005, "step": 763 }, { "epoch": 0.43470839260312943, "grad_norm": 0.5732872486114502, "learning_rate": 8.572760677540154e-05, "loss": 1.366, "step": 764 }, { "epoch": 0.4352773826458037, "grad_norm": 0.5682139992713928, "learning_rate": 8.550264885916877e-05, "loss": 1.359, "step": 765 }, { "epoch": 0.435846372688478, "grad_norm": 0.5898922085762024, "learning_rate": 8.527776585851328e-05, "loss": 1.5197, "step": 766 }, { "epoch": 0.4364153627311522, "grad_norm": 0.5902604460716248, "learning_rate": 8.505295893552594e-05, "loss": 1.4844, "step": 767 }, { "epoch": 0.43698435277382647, "grad_norm": 0.6057772040367126, "learning_rate": 8.482822925190452e-05, "loss": 1.5739, "step": 768 }, { "epoch": 0.4375533428165007, "grad_norm": 0.5546793341636658, "learning_rate": 8.460357796894773e-05, "loss": 1.4748, "step": 769 }, { "epoch": 0.43812233285917496, "grad_norm": 0.5493602156639099, "learning_rate": 8.437900624754895e-05, "loss": 1.3922, "step": 770 }, { "epoch": 0.4386913229018492, "grad_norm": 0.5499581098556519, "learning_rate": 8.415451524819058e-05, "loss": 1.574, "step": 771 }, { "epoch": 0.43926031294452345, "grad_norm": 0.5515440702438354, "learning_rate": 8.393010613093781e-05, "loss": 1.3672, "step": 772 }, { "epoch": 0.43982930298719775, "grad_norm": 0.5613058805465698, "learning_rate": 8.370578005543278e-05, "loss": 1.3815, "step": 773 }, { "epoch": 0.440398293029872, "grad_norm": 0.5643707513809204, "learning_rate": 8.348153818088844e-05, "loss": 1.5947, "step": 774 }, { "epoch": 0.44096728307254623, "grad_norm": 0.6310828924179077, "learning_rate": 8.325738166608263e-05, "loss": 1.5413, "step": 775 }, { "epoch": 0.4415362731152205, "grad_norm": 0.6655511856079102, "learning_rate": 8.303331166935209e-05, "loss": 1.5198, "step": 776 }, { "epoch": 0.4421052631578947, "grad_norm": 0.5539633631706238, "learning_rate": 8.280932934858652e-05, "loss": 1.4308, "step": 777 }, { "epoch": 0.44267425320056897, "grad_norm": 0.5974248647689819, "learning_rate": 8.25854358612225e-05, "loss": 1.537, "step": 778 }, { "epoch": 0.44324324324324327, "grad_norm": 0.5987525582313538, "learning_rate": 8.236163236423767e-05, "loss": 1.5318, "step": 779 }, { "epoch": 0.4438122332859175, "grad_norm": 0.5623188018798828, "learning_rate": 8.213792001414445e-05, "loss": 1.6016, "step": 780 }, { "epoch": 0.44438122332859176, "grad_norm": 0.5642153024673462, "learning_rate": 8.191429996698436e-05, "loss": 1.4452, "step": 781 }, { "epoch": 0.444950213371266, "grad_norm": 0.6042040586471558, "learning_rate": 8.1690773378322e-05, "loss": 1.6325, "step": 782 }, { "epoch": 0.44551920341394025, "grad_norm": 0.5777531862258911, "learning_rate": 8.146734140323896e-05, "loss": 1.6388, "step": 783 }, { "epoch": 0.4460881934566145, "grad_norm": 0.5600481629371643, "learning_rate": 8.124400519632788e-05, "loss": 1.5077, "step": 784 }, { "epoch": 0.4466571834992888, "grad_norm": 0.5644223690032959, "learning_rate": 8.102076591168655e-05, "loss": 1.4056, "step": 785 }, { "epoch": 0.44722617354196303, "grad_norm": 0.6023853421211243, "learning_rate": 8.079762470291191e-05, "loss": 1.713, "step": 786 }, { "epoch": 0.4477951635846373, "grad_norm": 0.5626102685928345, "learning_rate": 8.05745827230941e-05, "loss": 1.5125, "step": 787 }, { "epoch": 0.4483641536273115, "grad_norm": 0.5824998617172241, "learning_rate": 8.035164112481048e-05, "loss": 1.4695, "step": 788 }, { "epoch": 0.44893314366998577, "grad_norm": 0.5714951157569885, "learning_rate": 8.01288010601197e-05, "loss": 1.4452, "step": 789 }, { "epoch": 0.44950213371266, "grad_norm": 0.5934897065162659, "learning_rate": 7.990606368055564e-05, "loss": 1.5389, "step": 790 }, { "epoch": 0.45007112375533426, "grad_norm": 0.5794687867164612, "learning_rate": 7.968343013712167e-05, "loss": 1.4127, "step": 791 }, { "epoch": 0.45064011379800856, "grad_norm": 0.5628656148910522, "learning_rate": 7.946090158028455e-05, "loss": 1.4798, "step": 792 }, { "epoch": 0.4512091038406828, "grad_norm": 0.5794563293457031, "learning_rate": 7.923847915996851e-05, "loss": 1.5584, "step": 793 }, { "epoch": 0.45177809388335705, "grad_norm": 0.5685121417045593, "learning_rate": 7.901616402554933e-05, "loss": 1.51, "step": 794 }, { "epoch": 0.4523470839260313, "grad_norm": 0.568209171295166, "learning_rate": 7.87939573258483e-05, "loss": 1.5588, "step": 795 }, { "epoch": 0.45291607396870553, "grad_norm": 0.5683977603912354, "learning_rate": 7.857186020912647e-05, "loss": 1.4482, "step": 796 }, { "epoch": 0.4534850640113798, "grad_norm": 0.5802903771400452, "learning_rate": 7.834987382307861e-05, "loss": 1.5827, "step": 797 }, { "epoch": 0.4540540540540541, "grad_norm": 0.5780924558639526, "learning_rate": 7.812799931482721e-05, "loss": 1.4595, "step": 798 }, { "epoch": 0.4546230440967283, "grad_norm": 0.5929847359657288, "learning_rate": 7.790623783091677e-05, "loss": 1.5512, "step": 799 }, { "epoch": 0.45519203413940257, "grad_norm": 0.5519236326217651, "learning_rate": 7.768459051730752e-05, "loss": 1.4239, "step": 800 }, { "epoch": 0.4557610241820768, "grad_norm": 0.5426004528999329, "learning_rate": 7.74630585193699e-05, "loss": 1.3005, "step": 801 }, { "epoch": 0.45633001422475106, "grad_norm": 0.6065943241119385, "learning_rate": 7.724164298187838e-05, "loss": 1.3966, "step": 802 }, { "epoch": 0.4568990042674253, "grad_norm": 0.5971605777740479, "learning_rate": 7.70203450490056e-05, "loss": 1.5944, "step": 803 }, { "epoch": 0.45746799431009955, "grad_norm": 0.5548596978187561, "learning_rate": 7.679916586431654e-05, "loss": 1.4323, "step": 804 }, { "epoch": 0.45803698435277385, "grad_norm": 0.5478107929229736, "learning_rate": 7.657810657076243e-05, "loss": 1.3819, "step": 805 }, { "epoch": 0.4586059743954481, "grad_norm": 0.5837447047233582, "learning_rate": 7.635716831067505e-05, "loss": 1.3941, "step": 806 }, { "epoch": 0.45917496443812233, "grad_norm": 0.5920546650886536, "learning_rate": 7.613635222576072e-05, "loss": 1.5395, "step": 807 }, { "epoch": 0.4597439544807966, "grad_norm": 0.6047683358192444, "learning_rate": 7.59156594570944e-05, "loss": 1.4169, "step": 808 }, { "epoch": 0.4603129445234708, "grad_norm": 0.5774646401405334, "learning_rate": 7.569509114511386e-05, "loss": 1.5108, "step": 809 }, { "epoch": 0.46088193456614507, "grad_norm": 0.5855366587638855, "learning_rate": 7.547464842961362e-05, "loss": 1.6545, "step": 810 }, { "epoch": 0.46145092460881937, "grad_norm": 0.5752539038658142, "learning_rate": 7.52543324497393e-05, "loss": 1.6431, "step": 811 }, { "epoch": 0.4620199146514936, "grad_norm": 0.5689989328384399, "learning_rate": 7.503414434398151e-05, "loss": 1.2883, "step": 812 }, { "epoch": 0.46258890469416786, "grad_norm": 0.6341901421546936, "learning_rate": 7.481408525017013e-05, "loss": 1.4223, "step": 813 }, { "epoch": 0.4631578947368421, "grad_norm": 0.6005092263221741, "learning_rate": 7.459415630546842e-05, "loss": 1.5522, "step": 814 }, { "epoch": 0.46372688477951635, "grad_norm": 0.6249240636825562, "learning_rate": 7.437435864636691e-05, "loss": 1.5459, "step": 815 }, { "epoch": 0.4642958748221906, "grad_norm": 0.5745651125907898, "learning_rate": 7.415469340867787e-05, "loss": 1.6287, "step": 816 }, { "epoch": 0.4648648648648649, "grad_norm": 0.5915263891220093, "learning_rate": 7.393516172752919e-05, "loss": 1.4738, "step": 817 }, { "epoch": 0.46543385490753914, "grad_norm": 0.5895527601242065, "learning_rate": 7.371576473735867e-05, "loss": 1.6939, "step": 818 }, { "epoch": 0.4660028449502134, "grad_norm": 0.5770692825317383, "learning_rate": 7.349650357190807e-05, "loss": 1.4264, "step": 819 }, { "epoch": 0.4665718349928876, "grad_norm": 0.6085241436958313, "learning_rate": 7.327737936421721e-05, "loss": 1.5019, "step": 820 }, { "epoch": 0.46714082503556187, "grad_norm": 0.5652032494544983, "learning_rate": 7.305839324661823e-05, "loss": 1.3324, "step": 821 }, { "epoch": 0.4677098150782361, "grad_norm": 0.5609267950057983, "learning_rate": 7.283954635072968e-05, "loss": 1.3902, "step": 822 }, { "epoch": 0.46827880512091036, "grad_norm": 0.5592348575592041, "learning_rate": 7.262083980745069e-05, "loss": 1.4362, "step": 823 }, { "epoch": 0.46884779516358466, "grad_norm": 0.5790618658065796, "learning_rate": 7.240227474695509e-05, "loss": 1.4753, "step": 824 }, { "epoch": 0.4694167852062589, "grad_norm": 0.5804809927940369, "learning_rate": 7.218385229868559e-05, "loss": 1.2719, "step": 825 }, { "epoch": 0.46998577524893315, "grad_norm": 0.5487887859344482, "learning_rate": 7.196557359134794e-05, "loss": 1.3212, "step": 826 }, { "epoch": 0.4705547652916074, "grad_norm": 0.5842025876045227, "learning_rate": 7.174743975290513e-05, "loss": 1.5622, "step": 827 }, { "epoch": 0.47112375533428164, "grad_norm": 0.580644428730011, "learning_rate": 7.152945191057154e-05, "loss": 1.4567, "step": 828 }, { "epoch": 0.4716927453769559, "grad_norm": 0.5735095739364624, "learning_rate": 7.131161119080712e-05, "loss": 1.4547, "step": 829 }, { "epoch": 0.4722617354196302, "grad_norm": 0.5592243671417236, "learning_rate": 7.109391871931142e-05, "loss": 1.3144, "step": 830 }, { "epoch": 0.4728307254623044, "grad_norm": 0.581495463848114, "learning_rate": 7.087637562101813e-05, "loss": 1.5145, "step": 831 }, { "epoch": 0.47339971550497867, "grad_norm": 0.5653107762336731, "learning_rate": 7.065898302008886e-05, "loss": 1.388, "step": 832 }, { "epoch": 0.4739687055476529, "grad_norm": 0.5776169300079346, "learning_rate": 7.04417420399077e-05, "loss": 1.5059, "step": 833 }, { "epoch": 0.47453769559032716, "grad_norm": 0.556419312953949, "learning_rate": 7.02246538030751e-05, "loss": 1.3933, "step": 834 }, { "epoch": 0.4751066856330014, "grad_norm": 0.5605750679969788, "learning_rate": 7.000771943140218e-05, "loss": 1.4677, "step": 835 }, { "epoch": 0.4756756756756757, "grad_norm": 0.5609278678894043, "learning_rate": 6.979094004590507e-05, "loss": 1.4526, "step": 836 }, { "epoch": 0.47624466571834995, "grad_norm": 0.5990177392959595, "learning_rate": 6.957431676679896e-05, "loss": 1.6215, "step": 837 }, { "epoch": 0.4768136557610242, "grad_norm": 0.5737520456314087, "learning_rate": 6.935785071349228e-05, "loss": 1.3985, "step": 838 }, { "epoch": 0.47738264580369844, "grad_norm": 0.5521170496940613, "learning_rate": 6.914154300458115e-05, "loss": 1.6527, "step": 839 }, { "epoch": 0.4779516358463727, "grad_norm": 0.5809024572372437, "learning_rate": 6.892539475784326e-05, "loss": 1.5697, "step": 840 }, { "epoch": 0.4785206258890469, "grad_norm": 0.6158897876739502, "learning_rate": 6.870940709023237e-05, "loss": 1.48, "step": 841 }, { "epoch": 0.47908961593172117, "grad_norm": 0.5950735807418823, "learning_rate": 6.849358111787246e-05, "loss": 1.3335, "step": 842 }, { "epoch": 0.47965860597439547, "grad_norm": 0.5788929462432861, "learning_rate": 6.82779179560519e-05, "loss": 1.4746, "step": 843 }, { "epoch": 0.4802275960170697, "grad_norm": 0.6169467568397522, "learning_rate": 6.806241871921777e-05, "loss": 1.2997, "step": 844 }, { "epoch": 0.48079658605974396, "grad_norm": 0.5850261449813843, "learning_rate": 6.784708452096998e-05, "loss": 1.2293, "step": 845 }, { "epoch": 0.4813655761024182, "grad_norm": 0.5514947772026062, "learning_rate": 6.763191647405568e-05, "loss": 1.3825, "step": 846 }, { "epoch": 0.48193456614509245, "grad_norm": 0.5753430128097534, "learning_rate": 6.741691569036338e-05, "loss": 1.5195, "step": 847 }, { "epoch": 0.4825035561877667, "grad_norm": 0.5876197814941406, "learning_rate": 6.720208328091732e-05, "loss": 1.4453, "step": 848 }, { "epoch": 0.483072546230441, "grad_norm": 0.5744032859802246, "learning_rate": 6.69874203558716e-05, "loss": 1.4914, "step": 849 }, { "epoch": 0.48364153627311524, "grad_norm": 0.5800637006759644, "learning_rate": 6.677292802450447e-05, "loss": 1.4932, "step": 850 }, { "epoch": 0.4842105263157895, "grad_norm": 0.5554024577140808, "learning_rate": 6.655860739521271e-05, "loss": 1.1795, "step": 851 }, { "epoch": 0.4847795163584637, "grad_norm": 0.5711913704872131, "learning_rate": 6.634445957550577e-05, "loss": 1.486, "step": 852 }, { "epoch": 0.48534850640113797, "grad_norm": 0.5684107542037964, "learning_rate": 6.613048567200013e-05, "loss": 1.3984, "step": 853 }, { "epoch": 0.4859174964438122, "grad_norm": 0.5672001242637634, "learning_rate": 6.591668679041359e-05, "loss": 1.4811, "step": 854 }, { "epoch": 0.4864864864864865, "grad_norm": 0.5804989337921143, "learning_rate": 6.570306403555937e-05, "loss": 1.3624, "step": 855 }, { "epoch": 0.48705547652916076, "grad_norm": 0.6067745089530945, "learning_rate": 6.548961851134072e-05, "loss": 1.4192, "step": 856 }, { "epoch": 0.487624466571835, "grad_norm": 0.576329231262207, "learning_rate": 6.527635132074493e-05, "loss": 1.6314, "step": 857 }, { "epoch": 0.48819345661450925, "grad_norm": 0.5863393545150757, "learning_rate": 6.506326356583781e-05, "loss": 1.5669, "step": 858 }, { "epoch": 0.4887624466571835, "grad_norm": 0.6074771285057068, "learning_rate": 6.485035634775796e-05, "loss": 1.3334, "step": 859 }, { "epoch": 0.48933143669985774, "grad_norm": 0.5837851166725159, "learning_rate": 6.463763076671091e-05, "loss": 1.607, "step": 860 }, { "epoch": 0.489900426742532, "grad_norm": 0.5989742875099182, "learning_rate": 6.442508792196369e-05, "loss": 1.4518, "step": 861 }, { "epoch": 0.4904694167852063, "grad_norm": 0.5692201852798462, "learning_rate": 6.4212728911839e-05, "loss": 1.3878, "step": 862 }, { "epoch": 0.4910384068278805, "grad_norm": 0.6134719252586365, "learning_rate": 6.400055483370957e-05, "loss": 1.5154, "step": 863 }, { "epoch": 0.49160739687055477, "grad_norm": 0.5494038462638855, "learning_rate": 6.378856678399255e-05, "loss": 1.2968, "step": 864 }, { "epoch": 0.492176386913229, "grad_norm": 0.5780492424964905, "learning_rate": 6.357676585814366e-05, "loss": 1.5766, "step": 865 }, { "epoch": 0.49274537695590326, "grad_norm": 0.5398704409599304, "learning_rate": 6.336515315065168e-05, "loss": 1.4446, "step": 866 }, { "epoch": 0.4933143669985775, "grad_norm": 0.5509852170944214, "learning_rate": 6.315372975503285e-05, "loss": 1.4465, "step": 867 }, { "epoch": 0.4938833570412518, "grad_norm": 0.6671035885810852, "learning_rate": 6.294249676382508e-05, "loss": 1.706, "step": 868 }, { "epoch": 0.49445234708392605, "grad_norm": 0.579408586025238, "learning_rate": 6.273145526858236e-05, "loss": 1.5695, "step": 869 }, { "epoch": 0.4950213371266003, "grad_norm": 0.571058988571167, "learning_rate": 6.252060635986911e-05, "loss": 1.3541, "step": 870 }, { "epoch": 0.49559032716927454, "grad_norm": 0.5792422890663147, "learning_rate": 6.230995112725454e-05, "loss": 1.4329, "step": 871 }, { "epoch": 0.4961593172119488, "grad_norm": 0.5893927216529846, "learning_rate": 6.209949065930706e-05, "loss": 1.4674, "step": 872 }, { "epoch": 0.496728307254623, "grad_norm": 0.5954142212867737, "learning_rate": 6.188922604358865e-05, "loss": 1.4462, "step": 873 }, { "epoch": 0.4972972972972973, "grad_norm": 0.6741952896118164, "learning_rate": 6.16791583666492e-05, "loss": 1.6458, "step": 874 }, { "epoch": 0.49786628733997157, "grad_norm": 0.6125763654708862, "learning_rate": 6.146928871402081e-05, "loss": 1.5387, "step": 875 }, { "epoch": 0.4984352773826458, "grad_norm": 0.5839952230453491, "learning_rate": 6.12596181702124e-05, "loss": 1.6821, "step": 876 }, { "epoch": 0.49900426742532006, "grad_norm": 0.5859706401824951, "learning_rate": 6.1050147818704e-05, "loss": 1.4713, "step": 877 }, { "epoch": 0.4995732574679943, "grad_norm": 0.5910811424255371, "learning_rate": 6.0840878741941057e-05, "loss": 1.59, "step": 878 }, { "epoch": 0.5001422475106686, "grad_norm": 0.6297405958175659, "learning_rate": 6.063181202132901e-05, "loss": 1.5881, "step": 879 }, { "epoch": 0.5007112375533428, "grad_norm": 0.5714183449745178, "learning_rate": 6.0422948737227504e-05, "loss": 1.5894, "step": 880 }, { "epoch": 0.5012802275960171, "grad_norm": 0.5969492197036743, "learning_rate": 6.0214289968945004e-05, "loss": 1.6697, "step": 881 }, { "epoch": 0.5018492176386913, "grad_norm": 0.5817530155181885, "learning_rate": 6.000583679473315e-05, "loss": 1.5806, "step": 882 }, { "epoch": 0.5024182076813656, "grad_norm": 0.5869944095611572, "learning_rate": 5.979759029178107e-05, "loss": 1.4565, "step": 883 }, { "epoch": 0.5029871977240399, "grad_norm": 0.5745888948440552, "learning_rate": 5.958955153621004e-05, "loss": 1.5645, "step": 884 }, { "epoch": 0.5035561877667141, "grad_norm": 0.549628734588623, "learning_rate": 5.938172160306765e-05, "loss": 1.5017, "step": 885 }, { "epoch": 0.5041251778093884, "grad_norm": 0.5471094250679016, "learning_rate": 5.9174101566322504e-05, "loss": 1.2781, "step": 886 }, { "epoch": 0.5046941678520626, "grad_norm": 0.5772054195404053, "learning_rate": 5.896669249885851e-05, "loss": 1.386, "step": 887 }, { "epoch": 0.5052631578947369, "grad_norm": 0.6143761873245239, "learning_rate": 5.875949547246939e-05, "loss": 1.5432, "step": 888 }, { "epoch": 0.505832147937411, "grad_norm": 0.5768917202949524, "learning_rate": 5.8552511557853204e-05, "loss": 1.6945, "step": 889 }, { "epoch": 0.5064011379800853, "grad_norm": 0.5644556283950806, "learning_rate": 5.8345741824606617e-05, "loss": 1.5163, "step": 890 }, { "epoch": 0.5069701280227596, "grad_norm": 0.6083329916000366, "learning_rate": 5.813918734121955e-05, "loss": 1.7979, "step": 891 }, { "epoch": 0.5075391180654338, "grad_norm": 0.5543102025985718, "learning_rate": 5.7932849175069705e-05, "loss": 1.5558, "step": 892 }, { "epoch": 0.5081081081081081, "grad_norm": 0.6090741753578186, "learning_rate": 5.7726728392416874e-05, "loss": 1.6233, "step": 893 }, { "epoch": 0.5086770981507823, "grad_norm": 0.556496798992157, "learning_rate": 5.7520826058397525e-05, "loss": 1.5755, "step": 894 }, { "epoch": 0.5092460881934566, "grad_norm": 0.6258504986763, "learning_rate": 5.731514323701927e-05, "loss": 1.6054, "step": 895 }, { "epoch": 0.5098150782361308, "grad_norm": 0.6283307671546936, "learning_rate": 5.7109680991155364e-05, "loss": 1.8276, "step": 896 }, { "epoch": 0.5103840682788051, "grad_norm": 0.5817832946777344, "learning_rate": 5.690444038253935e-05, "loss": 1.6388, "step": 897 }, { "epoch": 0.5109530583214794, "grad_norm": 0.5892955660820007, "learning_rate": 5.669942247175933e-05, "loss": 1.2641, "step": 898 }, { "epoch": 0.5115220483641536, "grad_norm": 0.5834968686103821, "learning_rate": 5.649462831825265e-05, "loss": 1.4207, "step": 899 }, { "epoch": 0.5120910384068279, "grad_norm": 0.5753495693206787, "learning_rate": 5.629005898030035e-05, "loss": 1.4724, "step": 900 }, { "epoch": 0.5126600284495021, "grad_norm": 0.6050419211387634, "learning_rate": 5.608571551502175e-05, "loss": 1.7189, "step": 901 }, { "epoch": 0.5132290184921764, "grad_norm": 0.5946124196052551, "learning_rate": 5.588159897836902e-05, "loss": 1.3803, "step": 902 }, { "epoch": 0.5137980085348507, "grad_norm": 0.5731397867202759, "learning_rate": 5.56777104251216e-05, "loss": 1.7426, "step": 903 }, { "epoch": 0.5143669985775249, "grad_norm": 0.5813397169113159, "learning_rate": 5.5474050908880814e-05, "loss": 1.4898, "step": 904 }, { "epoch": 0.5149359886201992, "grad_norm": 0.5610973834991455, "learning_rate": 5.5270621482064465e-05, "loss": 1.4937, "step": 905 }, { "epoch": 0.5155049786628734, "grad_norm": 0.5550079941749573, "learning_rate": 5.50674231959013e-05, "loss": 1.3543, "step": 906 }, { "epoch": 0.5160739687055477, "grad_norm": 0.596593976020813, "learning_rate": 5.4864457100425783e-05, "loss": 1.5856, "step": 907 }, { "epoch": 0.5166429587482219, "grad_norm": 0.6018926501274109, "learning_rate": 5.4661724244472355e-05, "loss": 1.5092, "step": 908 }, { "epoch": 0.5172119487908962, "grad_norm": 0.6650524735450745, "learning_rate": 5.4459225675670264e-05, "loss": 1.7059, "step": 909 }, { "epoch": 0.5177809388335705, "grad_norm": 0.5858013033866882, "learning_rate": 5.425696244043807e-05, "loss": 1.4591, "step": 910 }, { "epoch": 0.5183499288762446, "grad_norm": 0.555473268032074, "learning_rate": 5.405493558397824e-05, "loss": 1.401, "step": 911 }, { "epoch": 0.518918918918919, "grad_norm": 0.6246885061264038, "learning_rate": 5.385314615027168e-05, "loss": 1.4415, "step": 912 }, { "epoch": 0.5194879089615931, "grad_norm": 0.608062207698822, "learning_rate": 5.365159518207252e-05, "loss": 1.4239, "step": 913 }, { "epoch": 0.5200568990042674, "grad_norm": 0.5979565382003784, "learning_rate": 5.345028372090256e-05, "loss": 1.4656, "step": 914 }, { "epoch": 0.5206258890469416, "grad_norm": 0.6553084254264832, "learning_rate": 5.324921280704589e-05, "loss": 1.4609, "step": 915 }, { "epoch": 0.5211948790896159, "grad_norm": 0.5839146971702576, "learning_rate": 5.304838347954363e-05, "loss": 1.5546, "step": 916 }, { "epoch": 0.5217638691322902, "grad_norm": 0.5618466734886169, "learning_rate": 5.284779677618841e-05, "loss": 1.4078, "step": 917 }, { "epoch": 0.5223328591749644, "grad_norm": 0.6020224690437317, "learning_rate": 5.264745373351923e-05, "loss": 1.568, "step": 918 }, { "epoch": 0.5229018492176387, "grad_norm": 0.6049513220787048, "learning_rate": 5.244735538681584e-05, "loss": 1.3196, "step": 919 }, { "epoch": 0.5234708392603129, "grad_norm": 0.5781171917915344, "learning_rate": 5.224750277009358e-05, "loss": 1.5366, "step": 920 }, { "epoch": 0.5240398293029872, "grad_norm": 0.6478269696235657, "learning_rate": 5.204789691609793e-05, "loss": 1.5281, "step": 921 }, { "epoch": 0.5246088193456615, "grad_norm": 0.598915696144104, "learning_rate": 5.184853885629921e-05, "loss": 1.5734, "step": 922 }, { "epoch": 0.5251778093883357, "grad_norm": 0.589694619178772, "learning_rate": 5.1649429620887334e-05, "loss": 1.4307, "step": 923 }, { "epoch": 0.52574679943101, "grad_norm": 0.5483283996582031, "learning_rate": 5.145057023876634e-05, "loss": 1.4334, "step": 924 }, { "epoch": 0.5263157894736842, "grad_norm": 0.5908382534980774, "learning_rate": 5.125196173754914e-05, "loss": 1.588, "step": 925 }, { "epoch": 0.5268847795163585, "grad_norm": 0.5898739695549011, "learning_rate": 5.105360514355222e-05, "loss": 1.5685, "step": 926 }, { "epoch": 0.5274537695590327, "grad_norm": 0.6079673171043396, "learning_rate": 5.0855501481790305e-05, "loss": 1.4421, "step": 927 }, { "epoch": 0.528022759601707, "grad_norm": 0.5824552178382874, "learning_rate": 5.0657651775971146e-05, "loss": 1.3472, "step": 928 }, { "epoch": 0.5285917496443813, "grad_norm": 0.5997583866119385, "learning_rate": 5.046005704849015e-05, "loss": 1.6292, "step": 929 }, { "epoch": 0.5291607396870555, "grad_norm": 0.5740709900856018, "learning_rate": 5.026271832042506e-05, "loss": 1.4085, "step": 930 }, { "epoch": 0.5297297297297298, "grad_norm": 0.5683955550193787, "learning_rate": 5.0065636611530767e-05, "loss": 1.4722, "step": 931 }, { "epoch": 0.530298719772404, "grad_norm": 0.5909097790718079, "learning_rate": 4.986881294023397e-05, "loss": 1.5688, "step": 932 }, { "epoch": 0.5308677098150782, "grad_norm": 0.5723986029624939, "learning_rate": 4.967224832362807e-05, "loss": 1.718, "step": 933 }, { "epoch": 0.5314366998577524, "grad_norm": 0.6397773623466492, "learning_rate": 4.947594377746769e-05, "loss": 1.5896, "step": 934 }, { "epoch": 0.5320056899004267, "grad_norm": 0.6130902171134949, "learning_rate": 4.9279900316163466e-05, "loss": 1.5974, "step": 935 }, { "epoch": 0.532574679943101, "grad_norm": 0.5888193845748901, "learning_rate": 4.908411895277704e-05, "loss": 1.569, "step": 936 }, { "epoch": 0.5331436699857752, "grad_norm": 0.5966805219650269, "learning_rate": 4.8888600699015496e-05, "loss": 1.4014, "step": 937 }, { "epoch": 0.5337126600284495, "grad_norm": 0.6131336092948914, "learning_rate": 4.869334656522644e-05, "loss": 1.5619, "step": 938 }, { "epoch": 0.5342816500711237, "grad_norm": 0.5846887826919556, "learning_rate": 4.849835756039254e-05, "loss": 1.5674, "step": 939 }, { "epoch": 0.534850640113798, "grad_norm": 0.5879199504852295, "learning_rate": 4.830363469212631e-05, "loss": 1.6148, "step": 940 }, { "epoch": 0.5354196301564723, "grad_norm": 0.6081675887107849, "learning_rate": 4.8109178966665194e-05, "loss": 1.5329, "step": 941 }, { "epoch": 0.5359886201991465, "grad_norm": 0.5982802510261536, "learning_rate": 4.791499138886603e-05, "loss": 1.5198, "step": 942 }, { "epoch": 0.5365576102418208, "grad_norm": 0.5899128913879395, "learning_rate": 4.7721072962199975e-05, "loss": 1.331, "step": 943 }, { "epoch": 0.537126600284495, "grad_norm": 0.6289139986038208, "learning_rate": 4.7527424688747535e-05, "loss": 1.3543, "step": 944 }, { "epoch": 0.5376955903271693, "grad_norm": 0.5747124552726746, "learning_rate": 4.733404756919287e-05, "loss": 1.2679, "step": 945 }, { "epoch": 0.5382645803698435, "grad_norm": 0.5888437032699585, "learning_rate": 4.7140942602819236e-05, "loss": 1.3506, "step": 946 }, { "epoch": 0.5388335704125178, "grad_norm": 0.6044580936431885, "learning_rate": 4.694811078750338e-05, "loss": 1.5955, "step": 947 }, { "epoch": 0.5394025604551921, "grad_norm": 0.6149877905845642, "learning_rate": 4.6755553119710524e-05, "loss": 1.5836, "step": 948 }, { "epoch": 0.5399715504978663, "grad_norm": 0.6135841012001038, "learning_rate": 4.656327059448937e-05, "loss": 1.4659, "step": 949 }, { "epoch": 0.5405405405405406, "grad_norm": 0.5868760943412781, "learning_rate": 4.637126420546653e-05, "loss": 1.3821, "step": 950 }, { "epoch": 0.5411095305832148, "grad_norm": 0.6109480261802673, "learning_rate": 4.6179534944842e-05, "loss": 1.5173, "step": 951 }, { "epoch": 0.5416785206258891, "grad_norm": 0.6133657693862915, "learning_rate": 4.5988083803383464e-05, "loss": 1.6325, "step": 952 }, { "epoch": 0.5422475106685632, "grad_norm": 0.593211829662323, "learning_rate": 4.57969117704215e-05, "loss": 1.361, "step": 953 }, { "epoch": 0.5428165007112375, "grad_norm": 0.5881854891777039, "learning_rate": 4.560601983384447e-05, "loss": 1.3796, "step": 954 }, { "epoch": 0.5433854907539118, "grad_norm": 0.65924471616745, "learning_rate": 4.5415408980093096e-05, "loss": 1.5899, "step": 955 }, { "epoch": 0.543954480796586, "grad_norm": 0.6201925277709961, "learning_rate": 4.522508019415587e-05, "loss": 1.536, "step": 956 }, { "epoch": 0.5445234708392603, "grad_norm": 0.5619149208068848, "learning_rate": 4.50350344595635e-05, "loss": 1.3624, "step": 957 }, { "epoch": 0.5450924608819345, "grad_norm": 0.5680489540100098, "learning_rate": 4.484527275838404e-05, "loss": 1.4247, "step": 958 }, { "epoch": 0.5456614509246088, "grad_norm": 0.5449238419532776, "learning_rate": 4.4655796071217937e-05, "loss": 1.3423, "step": 959 }, { "epoch": 0.5462304409672831, "grad_norm": 0.6032193899154663, "learning_rate": 4.446660537719256e-05, "loss": 1.6294, "step": 960 }, { "epoch": 0.5467994310099573, "grad_norm": 0.5516905784606934, "learning_rate": 4.427770165395766e-05, "loss": 1.3738, "step": 961 }, { "epoch": 0.5473684210526316, "grad_norm": 0.6235291361808777, "learning_rate": 4.4089085877679904e-05, "loss": 1.4602, "step": 962 }, { "epoch": 0.5479374110953058, "grad_norm": 0.6051345467567444, "learning_rate": 4.3900759023037974e-05, "loss": 1.3761, "step": 963 }, { "epoch": 0.5485064011379801, "grad_norm": 0.5858922600746155, "learning_rate": 4.3712722063217693e-05, "loss": 1.5158, "step": 964 }, { "epoch": 0.5490753911806543, "grad_norm": 0.5914279818534851, "learning_rate": 4.3524975969906636e-05, "loss": 1.3333, "step": 965 }, { "epoch": 0.5496443812233286, "grad_norm": 0.5849418044090271, "learning_rate": 4.3337521713289407e-05, "loss": 1.5459, "step": 966 }, { "epoch": 0.5502133712660029, "grad_norm": 0.5740037560462952, "learning_rate": 4.315036026204262e-05, "loss": 1.3858, "step": 967 }, { "epoch": 0.5507823613086771, "grad_norm": 0.5611101984977722, "learning_rate": 4.296349258332967e-05, "loss": 1.3895, "step": 968 }, { "epoch": 0.5513513513513514, "grad_norm": 0.585473895072937, "learning_rate": 4.277691964279594e-05, "loss": 1.2682, "step": 969 }, { "epoch": 0.5519203413940256, "grad_norm": 0.6113364100456238, "learning_rate": 4.259064240456374e-05, "loss": 1.4292, "step": 970 }, { "epoch": 0.5524893314366999, "grad_norm": 0.6335917115211487, "learning_rate": 4.2404661831227276e-05, "loss": 1.4529, "step": 971 }, { "epoch": 0.5530583214793741, "grad_norm": 0.574226975440979, "learning_rate": 4.2218978883847835e-05, "loss": 1.5254, "step": 972 }, { "epoch": 0.5536273115220484, "grad_norm": 0.5865671038627625, "learning_rate": 4.203359452194863e-05, "loss": 1.5265, "step": 973 }, { "epoch": 0.5541963015647227, "grad_norm": 0.5852011442184448, "learning_rate": 4.184850970350992e-05, "loss": 1.5834, "step": 974 }, { "epoch": 0.5547652916073968, "grad_norm": 0.6045235395431519, "learning_rate": 4.166372538496408e-05, "loss": 1.3905, "step": 975 }, { "epoch": 0.5553342816500711, "grad_norm": 0.558691143989563, "learning_rate": 4.147924252119063e-05, "loss": 1.5088, "step": 976 }, { "epoch": 0.5559032716927453, "grad_norm": 0.5623577237129211, "learning_rate": 4.129506206551138e-05, "loss": 1.3502, "step": 977 }, { "epoch": 0.5564722617354196, "grad_norm": 0.5946846604347229, "learning_rate": 4.1111184969685354e-05, "loss": 1.3884, "step": 978 }, { "epoch": 0.5570412517780939, "grad_norm": 0.5882412195205688, "learning_rate": 4.0927612183903976e-05, "loss": 1.542, "step": 979 }, { "epoch": 0.5576102418207681, "grad_norm": 0.577912449836731, "learning_rate": 4.0744344656786124e-05, "loss": 1.324, "step": 980 }, { "epoch": 0.5581792318634424, "grad_norm": 0.5644152164459229, "learning_rate": 4.056138333537326e-05, "loss": 1.2746, "step": 981 }, { "epoch": 0.5587482219061166, "grad_norm": 0.6058292984962463, "learning_rate": 4.037872916512455e-05, "loss": 1.5404, "step": 982 }, { "epoch": 0.5593172119487909, "grad_norm": 0.6061570644378662, "learning_rate": 4.019638308991189e-05, "loss": 1.3896, "step": 983 }, { "epoch": 0.5598862019914651, "grad_norm": 0.6102644205093384, "learning_rate": 4.0014346052015114e-05, "loss": 1.5365, "step": 984 }, { "epoch": 0.5604551920341394, "grad_norm": 0.5747568011283875, "learning_rate": 3.983261899211708e-05, "loss": 1.4337, "step": 985 }, { "epoch": 0.5610241820768137, "grad_norm": 0.5756990909576416, "learning_rate": 3.965120284929878e-05, "loss": 1.4752, "step": 986 }, { "epoch": 0.5615931721194879, "grad_norm": 0.570568323135376, "learning_rate": 3.947009856103465e-05, "loss": 1.4064, "step": 987 }, { "epoch": 0.5621621621621622, "grad_norm": 0.6102871298789978, "learning_rate": 3.928930706318752e-05, "loss": 1.5697, "step": 988 }, { "epoch": 0.5627311522048364, "grad_norm": 0.555619478225708, "learning_rate": 3.910882929000387e-05, "loss": 1.2905, "step": 989 }, { "epoch": 0.5633001422475107, "grad_norm": 0.6053213477134705, "learning_rate": 3.892866617410901e-05, "loss": 1.4823, "step": 990 }, { "epoch": 0.5638691322901849, "grad_norm": 0.5635027289390564, "learning_rate": 3.874881864650224e-05, "loss": 1.2325, "step": 991 }, { "epoch": 0.5644381223328592, "grad_norm": 0.6095726490020752, "learning_rate": 3.8569287636552024e-05, "loss": 1.5359, "step": 992 }, { "epoch": 0.5650071123755335, "grad_norm": 0.5644766092300415, "learning_rate": 3.839007407199129e-05, "loss": 1.277, "step": 993 }, { "epoch": 0.5655761024182077, "grad_norm": 0.5609472393989563, "learning_rate": 3.821117887891249e-05, "loss": 1.2394, "step": 994 }, { "epoch": 0.566145092460882, "grad_norm": 0.6164161562919617, "learning_rate": 3.803260298176288e-05, "loss": 1.5458, "step": 995 }, { "epoch": 0.5667140825035561, "grad_norm": 0.6040405631065369, "learning_rate": 3.7854347303339754e-05, "loss": 1.2356, "step": 996 }, { "epoch": 0.5672830725462304, "grad_norm": 0.6196702718734741, "learning_rate": 3.767641276478563e-05, "loss": 1.5923, "step": 997 }, { "epoch": 0.5678520625889047, "grad_norm": 0.5526005029678345, "learning_rate": 3.749880028558364e-05, "loss": 1.5057, "step": 998 }, { "epoch": 0.5684210526315789, "grad_norm": 0.5806797742843628, "learning_rate": 3.732151078355253e-05, "loss": 1.5355, "step": 999 }, { "epoch": 0.5689900426742532, "grad_norm": 0.5680354237556458, "learning_rate": 3.7144545174842115e-05, "loss": 1.4381, "step": 1000 }, { "epoch": 0.5695590327169274, "grad_norm": 0.5921180248260498, "learning_rate": 3.6967904373928475e-05, "loss": 1.3444, "step": 1001 }, { "epoch": 0.5701280227596017, "grad_norm": 0.5849342942237854, "learning_rate": 3.6791589293609184e-05, "loss": 1.3836, "step": 1002 }, { "epoch": 0.5706970128022759, "grad_norm": 0.5548643469810486, "learning_rate": 3.661560084499874e-05, "loss": 1.4809, "step": 1003 }, { "epoch": 0.5712660028449502, "grad_norm": 0.5976467132568359, "learning_rate": 3.64399399375237e-05, "loss": 1.4543, "step": 1004 }, { "epoch": 0.5718349928876245, "grad_norm": 0.588699996471405, "learning_rate": 3.6264607478918037e-05, "loss": 1.4448, "step": 1005 }, { "epoch": 0.5724039829302987, "grad_norm": 0.5786314606666565, "learning_rate": 3.608960437521844e-05, "loss": 1.769, "step": 1006 }, { "epoch": 0.572972972972973, "grad_norm": 0.6124690771102905, "learning_rate": 3.591493153075966e-05, "loss": 1.6527, "step": 1007 }, { "epoch": 0.5735419630156472, "grad_norm": 0.5587359070777893, "learning_rate": 3.5740589848169894e-05, "loss": 1.2819, "step": 1008 }, { "epoch": 0.5741109530583215, "grad_norm": 0.6170410513877869, "learning_rate": 3.556658022836594e-05, "loss": 1.5858, "step": 1009 }, { "epoch": 0.5746799431009957, "grad_norm": 0.5927881002426147, "learning_rate": 3.5392903570548694e-05, "loss": 1.6321, "step": 1010 }, { "epoch": 0.57524893314367, "grad_norm": 0.5902583599090576, "learning_rate": 3.521956077219847e-05, "loss": 1.5162, "step": 1011 }, { "epoch": 0.5758179231863443, "grad_norm": 0.6113704442977905, "learning_rate": 3.504655272907028e-05, "loss": 1.6929, "step": 1012 }, { "epoch": 0.5763869132290185, "grad_norm": 0.5586623549461365, "learning_rate": 3.4873880335189427e-05, "loss": 1.3555, "step": 1013 }, { "epoch": 0.5769559032716928, "grad_norm": 0.5992634296417236, "learning_rate": 3.470154448284659e-05, "loss": 1.6901, "step": 1014 }, { "epoch": 0.577524893314367, "grad_norm": 0.5722742676734924, "learning_rate": 3.452954606259343e-05, "loss": 1.386, "step": 1015 }, { "epoch": 0.5780938833570413, "grad_norm": 0.6090911030769348, "learning_rate": 3.435788596323789e-05, "loss": 1.528, "step": 1016 }, { "epoch": 0.5786628733997156, "grad_norm": 0.5943465828895569, "learning_rate": 3.41865650718396e-05, "loss": 1.4567, "step": 1017 }, { "epoch": 0.5792318634423897, "grad_norm": 0.5948119163513184, "learning_rate": 3.4015584273705425e-05, "loss": 1.4926, "step": 1018 }, { "epoch": 0.579800853485064, "grad_norm": 0.6115890741348267, "learning_rate": 3.384494445238471e-05, "loss": 1.4113, "step": 1019 }, { "epoch": 0.5803698435277382, "grad_norm": 0.5682458281517029, "learning_rate": 3.367464648966471e-05, "loss": 1.514, "step": 1020 }, { "epoch": 0.5809388335704125, "grad_norm": 0.5994877219200134, "learning_rate": 3.350469126556627e-05, "loss": 1.495, "step": 1021 }, { "epoch": 0.5815078236130867, "grad_norm": 0.5887535810470581, "learning_rate": 3.333507965833905e-05, "loss": 1.6428, "step": 1022 }, { "epoch": 0.582076813655761, "grad_norm": 0.5758301615715027, "learning_rate": 3.316581254445701e-05, "loss": 1.4076, "step": 1023 }, { "epoch": 0.5826458036984353, "grad_norm": 0.6117954850196838, "learning_rate": 3.299689079861408e-05, "loss": 1.4471, "step": 1024 }, { "epoch": 0.5832147937411095, "grad_norm": 0.6079879999160767, "learning_rate": 3.2828315293719245e-05, "loss": 1.485, "step": 1025 }, { "epoch": 0.5837837837837838, "grad_norm": 0.5936009287834167, "learning_rate": 3.266008690089253e-05, "loss": 1.6109, "step": 1026 }, { "epoch": 0.584352773826458, "grad_norm": 0.5736754536628723, "learning_rate": 3.24922064894601e-05, "loss": 1.4451, "step": 1027 }, { "epoch": 0.5849217638691323, "grad_norm": 0.5830667018890381, "learning_rate": 3.23246749269499e-05, "loss": 1.499, "step": 1028 }, { "epoch": 0.5854907539118065, "grad_norm": 0.5929978489875793, "learning_rate": 3.2157493079087343e-05, "loss": 1.5964, "step": 1029 }, { "epoch": 0.5860597439544808, "grad_norm": 0.5748528242111206, "learning_rate": 3.1990661809790445e-05, "loss": 1.3425, "step": 1030 }, { "epoch": 0.5866287339971551, "grad_norm": 0.6261157393455505, "learning_rate": 3.18241819811658e-05, "loss": 1.4458, "step": 1031 }, { "epoch": 0.5871977240398293, "grad_norm": 0.5736514925956726, "learning_rate": 3.165805445350383e-05, "loss": 1.3948, "step": 1032 }, { "epoch": 0.5877667140825036, "grad_norm": 0.6165857911109924, "learning_rate": 3.149228008527437e-05, "loss": 1.6043, "step": 1033 }, { "epoch": 0.5883357041251778, "grad_norm": 0.6109797954559326, "learning_rate": 3.132685973312251e-05, "loss": 1.5376, "step": 1034 }, { "epoch": 0.5889046941678521, "grad_norm": 0.5716987252235413, "learning_rate": 3.116179425186361e-05, "loss": 1.3554, "step": 1035 }, { "epoch": 0.5894736842105263, "grad_norm": 0.6563665866851807, "learning_rate": 3.099708449447956e-05, "loss": 1.4934, "step": 1036 }, { "epoch": 0.5900426742532006, "grad_norm": 0.6072697043418884, "learning_rate": 3.083273131211382e-05, "loss": 1.3181, "step": 1037 }, { "epoch": 0.5906116642958749, "grad_norm": 0.5769975781440735, "learning_rate": 3.066873555406727e-05, "loss": 1.5376, "step": 1038 }, { "epoch": 0.591180654338549, "grad_norm": 0.58552485704422, "learning_rate": 3.0505098067793937e-05, "loss": 1.3483, "step": 1039 }, { "epoch": 0.5917496443812233, "grad_norm": 0.6377474069595337, "learning_rate": 3.0341819698896202e-05, "loss": 1.6044, "step": 1040 }, { "epoch": 0.5923186344238975, "grad_norm": 0.5746393203735352, "learning_rate": 3.017890129112094e-05, "loss": 1.5081, "step": 1041 }, { "epoch": 0.5928876244665718, "grad_norm": 0.5879509449005127, "learning_rate": 3.0016343686354775e-05, "loss": 1.7884, "step": 1042 }, { "epoch": 0.5934566145092461, "grad_norm": 0.5871498584747314, "learning_rate": 2.9854147724619886e-05, "loss": 1.4425, "step": 1043 }, { "epoch": 0.5940256045519203, "grad_norm": 0.6417199373245239, "learning_rate": 2.9692314244069764e-05, "loss": 1.4729, "step": 1044 }, { "epoch": 0.5940256045519203, "eval_loss": 1.4645270109176636, "eval_runtime": 16.2716, "eval_samples_per_second": 45.478, "eval_steps_per_second": 22.739, "step": 1044 }, { "epoch": 0.5945945945945946, "grad_norm": 0.5834308862686157, "learning_rate": 2.9530844080984565e-05, "loss": 1.4174, "step": 1045 }, { "epoch": 0.5951635846372688, "grad_norm": 0.5811535120010376, "learning_rate": 2.9369738069767107e-05, "loss": 1.2859, "step": 1046 }, { "epoch": 0.5957325746799431, "grad_norm": 0.6040303707122803, "learning_rate": 2.920899704293849e-05, "loss": 1.7526, "step": 1047 }, { "epoch": 0.5963015647226173, "grad_norm": 0.5936810970306396, "learning_rate": 2.9048621831133616e-05, "loss": 1.3031, "step": 1048 }, { "epoch": 0.5968705547652916, "grad_norm": 0.5825332999229431, "learning_rate": 2.8888613263097153e-05, "loss": 1.3483, "step": 1049 }, { "epoch": 0.5974395448079659, "grad_norm": 0.6082255244255066, "learning_rate": 2.8728972165679067e-05, "loss": 1.528, "step": 1050 }, { "epoch": 0.5980085348506401, "grad_norm": 0.594572126865387, "learning_rate": 2.8569699363830316e-05, "loss": 1.5789, "step": 1051 }, { "epoch": 0.5985775248933144, "grad_norm": 0.6006420850753784, "learning_rate": 2.8410795680598846e-05, "loss": 1.5638, "step": 1052 }, { "epoch": 0.5991465149359886, "grad_norm": 0.5715523958206177, "learning_rate": 2.825226193712507e-05, "loss": 1.5222, "step": 1053 }, { "epoch": 0.5997155049786629, "grad_norm": 0.5750184059143066, "learning_rate": 2.8094098952637692e-05, "loss": 1.5154, "step": 1054 }, { "epoch": 0.6002844950213371, "grad_norm": 0.5836694240570068, "learning_rate": 2.793630754444967e-05, "loss": 1.4624, "step": 1055 }, { "epoch": 0.6008534850640114, "grad_norm": 0.5644353628158569, "learning_rate": 2.7778888527953572e-05, "loss": 1.564, "step": 1056 }, { "epoch": 0.6014224751066857, "grad_norm": 0.6327478885650635, "learning_rate": 2.762184271661785e-05, "loss": 1.4707, "step": 1057 }, { "epoch": 0.6019914651493599, "grad_norm": 0.5783342719078064, "learning_rate": 2.746517092198231e-05, "loss": 1.4888, "step": 1058 }, { "epoch": 0.6025604551920342, "grad_norm": 0.5796740651130676, "learning_rate": 2.730887395365397e-05, "loss": 1.5201, "step": 1059 }, { "epoch": 0.6031294452347084, "grad_norm": 0.5543321967124939, "learning_rate": 2.715295261930306e-05, "loss": 1.4378, "step": 1060 }, { "epoch": 0.6036984352773827, "grad_norm": 0.6244597434997559, "learning_rate": 2.699740772465851e-05, "loss": 1.4242, "step": 1061 }, { "epoch": 0.604267425320057, "grad_norm": 0.5890554785728455, "learning_rate": 2.6842240073504165e-05, "loss": 1.4732, "step": 1062 }, { "epoch": 0.6048364153627311, "grad_norm": 0.5934953689575195, "learning_rate": 2.668745046767436e-05, "loss": 1.517, "step": 1063 }, { "epoch": 0.6054054054054054, "grad_norm": 0.5716105103492737, "learning_rate": 2.6533039707049834e-05, "loss": 1.2859, "step": 1064 }, { "epoch": 0.6059743954480796, "grad_norm": 0.5798661708831787, "learning_rate": 2.63790085895538e-05, "loss": 1.5015, "step": 1065 }, { "epoch": 0.6065433854907539, "grad_norm": 0.600385844707489, "learning_rate": 2.6225357911147385e-05, "loss": 1.4027, "step": 1066 }, { "epoch": 0.6071123755334281, "grad_norm": 0.5749977231025696, "learning_rate": 2.6072088465826038e-05, "loss": 1.5876, "step": 1067 }, { "epoch": 0.6076813655761024, "grad_norm": 0.5585724711418152, "learning_rate": 2.591920104561503e-05, "loss": 1.4756, "step": 1068 }, { "epoch": 0.6082503556187767, "grad_norm": 0.5597994327545166, "learning_rate": 2.5766696440565496e-05, "loss": 1.4621, "step": 1069 }, { "epoch": 0.6088193456614509, "grad_norm": 0.5746705532073975, "learning_rate": 2.5614575438750522e-05, "loss": 1.1686, "step": 1070 }, { "epoch": 0.6093883357041252, "grad_norm": 0.6218065023422241, "learning_rate": 2.546283882626065e-05, "loss": 1.4789, "step": 1071 }, { "epoch": 0.6099573257467994, "grad_norm": 0.6003706455230713, "learning_rate": 2.5311487387200306e-05, "loss": 1.3938, "step": 1072 }, { "epoch": 0.6105263157894737, "grad_norm": 0.6234976649284363, "learning_rate": 2.516052190368341e-05, "loss": 1.4399, "step": 1073 }, { "epoch": 0.6110953058321479, "grad_norm": 0.5926810503005981, "learning_rate": 2.500994315582943e-05, "loss": 1.3032, "step": 1074 }, { "epoch": 0.6116642958748222, "grad_norm": 0.5906057953834534, "learning_rate": 2.485975192175949e-05, "loss": 1.2748, "step": 1075 }, { "epoch": 0.6122332859174965, "grad_norm": 0.5761781334877014, "learning_rate": 2.4709948977592034e-05, "loss": 1.4486, "step": 1076 }, { "epoch": 0.6128022759601707, "grad_norm": 0.6170504093170166, "learning_rate": 2.4560535097439108e-05, "loss": 1.5943, "step": 1077 }, { "epoch": 0.613371266002845, "grad_norm": 0.6018140912055969, "learning_rate": 2.4411511053402302e-05, "loss": 1.5996, "step": 1078 }, { "epoch": 0.6139402560455192, "grad_norm": 0.5538153052330017, "learning_rate": 2.4262877615568626e-05, "loss": 1.4874, "step": 1079 }, { "epoch": 0.6145092460881935, "grad_norm": 0.5843609571456909, "learning_rate": 2.411463555200667e-05, "loss": 1.269, "step": 1080 }, { "epoch": 0.6150782361308678, "grad_norm": 0.5559793710708618, "learning_rate": 2.3966785628762546e-05, "loss": 1.5796, "step": 1081 }, { "epoch": 0.615647226173542, "grad_norm": 0.5636264085769653, "learning_rate": 2.381932860985596e-05, "loss": 1.2805, "step": 1082 }, { "epoch": 0.6162162162162163, "grad_norm": 0.6079363226890564, "learning_rate": 2.3672265257276383e-05, "loss": 1.5295, "step": 1083 }, { "epoch": 0.6167852062588904, "grad_norm": 0.6165335178375244, "learning_rate": 2.352559633097885e-05, "loss": 1.5551, "step": 1084 }, { "epoch": 0.6173541963015647, "grad_norm": 0.6137623190879822, "learning_rate": 2.337932258888028e-05, "loss": 1.4585, "step": 1085 }, { "epoch": 0.6179231863442389, "grad_norm": 0.5806836485862732, "learning_rate": 2.3233444786855407e-05, "loss": 1.5539, "step": 1086 }, { "epoch": 0.6184921763869132, "grad_norm": 0.5768011212348938, "learning_rate": 2.308796367873296e-05, "loss": 1.4415, "step": 1087 }, { "epoch": 0.6190611664295875, "grad_norm": 0.5644312500953674, "learning_rate": 2.294288001629177e-05, "loss": 1.4668, "step": 1088 }, { "epoch": 0.6196301564722617, "grad_norm": 0.5748885869979858, "learning_rate": 2.2798194549256792e-05, "loss": 1.3066, "step": 1089 }, { "epoch": 0.620199146514936, "grad_norm": 0.5609626770019531, "learning_rate": 2.2653908025295323e-05, "loss": 1.3779, "step": 1090 }, { "epoch": 0.6207681365576102, "grad_norm": 0.5691306591033936, "learning_rate": 2.251002119001312e-05, "loss": 1.442, "step": 1091 }, { "epoch": 0.6213371266002845, "grad_norm": 0.589314877986908, "learning_rate": 2.2366534786950467e-05, "loss": 1.4482, "step": 1092 }, { "epoch": 0.6219061166429587, "grad_norm": 0.5820268392562866, "learning_rate": 2.222344955757851e-05, "loss": 1.4195, "step": 1093 }, { "epoch": 0.622475106685633, "grad_norm": 0.6211294531822205, "learning_rate": 2.2080766241295235e-05, "loss": 1.549, "step": 1094 }, { "epoch": 0.6230440967283073, "grad_norm": 0.6313804984092712, "learning_rate": 2.1938485575421752e-05, "loss": 1.6662, "step": 1095 }, { "epoch": 0.6236130867709815, "grad_norm": 0.5776501297950745, "learning_rate": 2.1796608295198462e-05, "loss": 1.3551, "step": 1096 }, { "epoch": 0.6241820768136558, "grad_norm": 0.5959988236427307, "learning_rate": 2.165513513378121e-05, "loss": 1.4321, "step": 1097 }, { "epoch": 0.62475106685633, "grad_norm": 0.5878854393959045, "learning_rate": 2.1514066822237665e-05, "loss": 1.428, "step": 1098 }, { "epoch": 0.6253200568990043, "grad_norm": 0.5653113722801208, "learning_rate": 2.137340408954329e-05, "loss": 1.3464, "step": 1099 }, { "epoch": 0.6258890469416786, "grad_norm": 0.5969840884208679, "learning_rate": 2.1233147662577767e-05, "loss": 1.4497, "step": 1100 }, { "epoch": 0.6264580369843528, "grad_norm": 0.5675022602081299, "learning_rate": 2.1093298266121165e-05, "loss": 1.4289, "step": 1101 }, { "epoch": 0.6270270270270271, "grad_norm": 0.6396809816360474, "learning_rate": 2.0953856622850176e-05, "loss": 1.4908, "step": 1102 }, { "epoch": 0.6275960170697013, "grad_norm": 0.5843429565429688, "learning_rate": 2.081482345333452e-05, "loss": 1.6213, "step": 1103 }, { "epoch": 0.6281650071123756, "grad_norm": 0.5792785882949829, "learning_rate": 2.0676199476033e-05, "loss": 1.57, "step": 1104 }, { "epoch": 0.6287339971550497, "grad_norm": 0.6015857458114624, "learning_rate": 2.053798540728995e-05, "loss": 1.5818, "step": 1105 }, { "epoch": 0.629302987197724, "grad_norm": 0.5723267197608948, "learning_rate": 2.0400181961331478e-05, "loss": 1.3799, "step": 1106 }, { "epoch": 0.6298719772403983, "grad_norm": 0.6322827339172363, "learning_rate": 2.0262789850261798e-05, "loss": 1.4456, "step": 1107 }, { "epoch": 0.6304409672830725, "grad_norm": 0.6475574970245361, "learning_rate": 2.012580978405949e-05, "loss": 1.6081, "step": 1108 }, { "epoch": 0.6310099573257468, "grad_norm": 0.5577263832092285, "learning_rate": 1.9989242470573975e-05, "loss": 1.319, "step": 1109 }, { "epoch": 0.631578947368421, "grad_norm": 0.5767825245857239, "learning_rate": 1.9853088615521663e-05, "loss": 1.1708, "step": 1110 }, { "epoch": 0.6321479374110953, "grad_norm": 0.5954525470733643, "learning_rate": 1.9717348922482458e-05, "loss": 1.3891, "step": 1111 }, { "epoch": 0.6327169274537695, "grad_norm": 0.6102060079574585, "learning_rate": 1.9582024092896033e-05, "loss": 1.3531, "step": 1112 }, { "epoch": 0.6332859174964438, "grad_norm": 0.5929975509643555, "learning_rate": 1.9447114826058233e-05, "loss": 1.5927, "step": 1113 }, { "epoch": 0.6338549075391181, "grad_norm": 0.5874947905540466, "learning_rate": 1.931262181911754e-05, "loss": 1.4828, "step": 1114 }, { "epoch": 0.6344238975817923, "grad_norm": 0.5605891346931458, "learning_rate": 1.9178545767071322e-05, "loss": 1.655, "step": 1115 }, { "epoch": 0.6349928876244666, "grad_norm": 0.5770267248153687, "learning_rate": 1.9044887362762343e-05, "loss": 1.3424, "step": 1116 }, { "epoch": 0.6355618776671408, "grad_norm": 0.639620840549469, "learning_rate": 1.8911647296875147e-05, "loss": 1.3701, "step": 1117 }, { "epoch": 0.6361308677098151, "grad_norm": 0.6051335334777832, "learning_rate": 1.87788262579325e-05, "loss": 1.3458, "step": 1118 }, { "epoch": 0.6366998577524894, "grad_norm": 0.5964054465293884, "learning_rate": 1.8646424932291896e-05, "loss": 1.5532, "step": 1119 }, { "epoch": 0.6372688477951636, "grad_norm": 0.6007367968559265, "learning_rate": 1.851444400414185e-05, "loss": 1.5373, "step": 1120 }, { "epoch": 0.6378378378378379, "grad_norm": 0.5970923900604248, "learning_rate": 1.8382884155498514e-05, "loss": 1.5256, "step": 1121 }, { "epoch": 0.6384068278805121, "grad_norm": 0.6180837750434875, "learning_rate": 1.8251746066202058e-05, "loss": 1.4781, "step": 1122 }, { "epoch": 0.6389758179231864, "grad_norm": 0.6046369671821594, "learning_rate": 1.812103041391322e-05, "loss": 1.4899, "step": 1123 }, { "epoch": 0.6395448079658606, "grad_norm": 0.5703504085540771, "learning_rate": 1.799073787410982e-05, "loss": 1.5633, "step": 1124 }, { "epoch": 0.6401137980085349, "grad_norm": 0.6019449830055237, "learning_rate": 1.786086912008316e-05, "loss": 1.3685, "step": 1125 }, { "epoch": 0.6406827880512092, "grad_norm": 0.5852835774421692, "learning_rate": 1.773142482293464e-05, "loss": 1.5065, "step": 1126 }, { "epoch": 0.6412517780938833, "grad_norm": 0.5664365887641907, "learning_rate": 1.7602405651572275e-05, "loss": 1.5823, "step": 1127 }, { "epoch": 0.6418207681365576, "grad_norm": 0.5778409242630005, "learning_rate": 1.747381227270718e-05, "loss": 1.4294, "step": 1128 }, { "epoch": 0.6423897581792318, "grad_norm": 0.5901049375534058, "learning_rate": 1.734564535085028e-05, "loss": 1.3996, "step": 1129 }, { "epoch": 0.6429587482219061, "grad_norm": 0.6099653244018555, "learning_rate": 1.721790554830869e-05, "loss": 1.5873, "step": 1130 }, { "epoch": 0.6435277382645803, "grad_norm": 0.5981472730636597, "learning_rate": 1.7090593525182287e-05, "loss": 1.5958, "step": 1131 }, { "epoch": 0.6440967283072546, "grad_norm": 0.6043581366539001, "learning_rate": 1.6963709939360585e-05, "loss": 1.561, "step": 1132 }, { "epoch": 0.6446657183499289, "grad_norm": 0.6230269074440002, "learning_rate": 1.6837255446518964e-05, "loss": 1.4484, "step": 1133 }, { "epoch": 0.6452347083926031, "grad_norm": 0.579458475112915, "learning_rate": 1.671123070011551e-05, "loss": 1.597, "step": 1134 }, { "epoch": 0.6458036984352774, "grad_norm": 0.5982540845870972, "learning_rate": 1.6585636351387635e-05, "loss": 1.5299, "step": 1135 }, { "epoch": 0.6463726884779516, "grad_norm": 0.6335290670394897, "learning_rate": 1.646047304934851e-05, "loss": 1.6529, "step": 1136 }, { "epoch": 0.6469416785206259, "grad_norm": 0.580467164516449, "learning_rate": 1.6335741440784035e-05, "loss": 1.5459, "step": 1137 }, { "epoch": 0.6475106685633002, "grad_norm": 0.5840801000595093, "learning_rate": 1.621144217024918e-05, "loss": 1.3808, "step": 1138 }, { "epoch": 0.6480796586059744, "grad_norm": 0.592555582523346, "learning_rate": 1.608757588006483e-05, "loss": 1.5013, "step": 1139 }, { "epoch": 0.6486486486486487, "grad_norm": 0.5938240885734558, "learning_rate": 1.596414321031452e-05, "loss": 1.3971, "step": 1140 }, { "epoch": 0.6492176386913229, "grad_norm": 0.5719125270843506, "learning_rate": 1.5841144798840855e-05, "loss": 1.4372, "step": 1141 }, { "epoch": 0.6497866287339972, "grad_norm": 0.6199617981910706, "learning_rate": 1.5718581281242572e-05, "loss": 1.6019, "step": 1142 }, { "epoch": 0.6503556187766714, "grad_norm": 0.594205379486084, "learning_rate": 1.5596453290870982e-05, "loss": 1.5322, "step": 1143 }, { "epoch": 0.6509246088193457, "grad_norm": 0.6198005676269531, "learning_rate": 1.5474761458826793e-05, "loss": 1.4777, "step": 1144 }, { "epoch": 0.65149359886202, "grad_norm": 0.567058265209198, "learning_rate": 1.5353506413956932e-05, "loss": 1.5108, "step": 1145 }, { "epoch": 0.6520625889046942, "grad_norm": 0.5950532555580139, "learning_rate": 1.5232688782851068e-05, "loss": 1.5038, "step": 1146 }, { "epoch": 0.6526315789473685, "grad_norm": 0.6178238987922668, "learning_rate": 1.511230918983867e-05, "loss": 1.5458, "step": 1147 }, { "epoch": 0.6532005689900426, "grad_norm": 0.5962685346603394, "learning_rate": 1.4992368256985546e-05, "loss": 1.432, "step": 1148 }, { "epoch": 0.6537695590327169, "grad_norm": 0.5979620218276978, "learning_rate": 1.4872866604090696e-05, "loss": 1.5035, "step": 1149 }, { "epoch": 0.6543385490753911, "grad_norm": 0.5819264650344849, "learning_rate": 1.475380484868325e-05, "loss": 1.4169, "step": 1150 }, { "epoch": 0.6549075391180654, "grad_norm": 0.6469606757164001, "learning_rate": 1.4635183606018943e-05, "loss": 1.4442, "step": 1151 }, { "epoch": 0.6554765291607397, "grad_norm": 0.5789610147476196, "learning_rate": 1.451700348907734e-05, "loss": 1.122, "step": 1152 }, { "epoch": 0.6560455192034139, "grad_norm": 0.5797150135040283, "learning_rate": 1.4399265108558379e-05, "loss": 1.5795, "step": 1153 }, { "epoch": 0.6566145092460882, "grad_norm": 0.6014512777328491, "learning_rate": 1.4281969072879298e-05, "loss": 1.3673, "step": 1154 }, { "epoch": 0.6571834992887624, "grad_norm": 0.566061794757843, "learning_rate": 1.4165115988171596e-05, "loss": 1.4255, "step": 1155 }, { "epoch": 0.6577524893314367, "grad_norm": 0.599322497844696, "learning_rate": 1.4048706458277672e-05, "loss": 1.4538, "step": 1156 }, { "epoch": 0.658321479374111, "grad_norm": 0.6258883476257324, "learning_rate": 1.3932741084747913e-05, "loss": 1.5197, "step": 1157 }, { "epoch": 0.6588904694167852, "grad_norm": 0.603754460811615, "learning_rate": 1.3817220466837566e-05, "loss": 1.5596, "step": 1158 }, { "epoch": 0.6594594594594595, "grad_norm": 0.5680553317070007, "learning_rate": 1.3702145201503458e-05, "loss": 1.3882, "step": 1159 }, { "epoch": 0.6600284495021337, "grad_norm": 0.6317921280860901, "learning_rate": 1.3587515883401202e-05, "loss": 1.4051, "step": 1160 }, { "epoch": 0.660597439544808, "grad_norm": 0.5998348593711853, "learning_rate": 1.3473333104881792e-05, "loss": 1.5309, "step": 1161 }, { "epoch": 0.6611664295874822, "grad_norm": 0.5694242715835571, "learning_rate": 1.3359597455988803e-05, "loss": 1.4933, "step": 1162 }, { "epoch": 0.6617354196301565, "grad_norm": 0.6193349361419678, "learning_rate": 1.3246309524455291e-05, "loss": 1.5781, "step": 1163 }, { "epoch": 0.6623044096728308, "grad_norm": 0.5579991340637207, "learning_rate": 1.3133469895700634e-05, "loss": 1.3616, "step": 1164 }, { "epoch": 0.662873399715505, "grad_norm": 0.5790702104568481, "learning_rate": 1.3021079152827631e-05, "loss": 1.3994, "step": 1165 }, { "epoch": 0.6634423897581793, "grad_norm": 0.5730209946632385, "learning_rate": 1.2909137876619448e-05, "loss": 1.3269, "step": 1166 }, { "epoch": 0.6640113798008535, "grad_norm": 0.6066268086433411, "learning_rate": 1.2797646645536566e-05, "loss": 1.6239, "step": 1167 }, { "epoch": 0.6645803698435278, "grad_norm": 0.649182140827179, "learning_rate": 1.2686606035713944e-05, "loss": 1.7304, "step": 1168 }, { "epoch": 0.6651493598862019, "grad_norm": 0.6383649110794067, "learning_rate": 1.2576016620957853e-05, "loss": 1.4477, "step": 1169 }, { "epoch": 0.6657183499288762, "grad_norm": 0.5763673782348633, "learning_rate": 1.2465878972743028e-05, "loss": 1.4846, "step": 1170 }, { "epoch": 0.6662873399715505, "grad_norm": 0.5865679383277893, "learning_rate": 1.2356193660209681e-05, "loss": 1.5687, "step": 1171 }, { "epoch": 0.6668563300142247, "grad_norm": 0.5898412466049194, "learning_rate": 1.2246961250160527e-05, "loss": 1.5227, "step": 1172 }, { "epoch": 0.667425320056899, "grad_norm": 0.6910015344619751, "learning_rate": 1.2138182307057987e-05, "loss": 1.245, "step": 1173 }, { "epoch": 0.6679943100995732, "grad_norm": 0.5660498142242432, "learning_rate": 1.2029857393021094e-05, "loss": 1.2887, "step": 1174 }, { "epoch": 0.6685633001422475, "grad_norm": 0.5966072082519531, "learning_rate": 1.1921987067822672e-05, "loss": 1.3417, "step": 1175 }, { "epoch": 0.6691322901849218, "grad_norm": 0.5854772329330444, "learning_rate": 1.1814571888886483e-05, "loss": 1.474, "step": 1176 }, { "epoch": 0.669701280227596, "grad_norm": 0.6000230312347412, "learning_rate": 1.1707612411284253e-05, "loss": 1.4276, "step": 1177 }, { "epoch": 0.6702702702702703, "grad_norm": 0.5757988691329956, "learning_rate": 1.1601109187732928e-05, "loss": 1.5459, "step": 1178 }, { "epoch": 0.6708392603129445, "grad_norm": 0.5930068492889404, "learning_rate": 1.149506276859167e-05, "loss": 1.4149, "step": 1179 }, { "epoch": 0.6714082503556188, "grad_norm": 0.5741011500358582, "learning_rate": 1.1389473701859121e-05, "loss": 1.2504, "step": 1180 }, { "epoch": 0.671977240398293, "grad_norm": 0.588571310043335, "learning_rate": 1.1284342533170545e-05, "loss": 1.6301, "step": 1181 }, { "epoch": 0.6725462304409673, "grad_norm": 0.5500454306602478, "learning_rate": 1.1179669805794968e-05, "loss": 1.4952, "step": 1182 }, { "epoch": 0.6731152204836416, "grad_norm": 0.5811514854431152, "learning_rate": 1.1075456060632472e-05, "loss": 1.447, "step": 1183 }, { "epoch": 0.6736842105263158, "grad_norm": 0.6315092444419861, "learning_rate": 1.0971701836211268e-05, "loss": 1.5707, "step": 1184 }, { "epoch": 0.6742532005689901, "grad_norm": 0.6309195756912231, "learning_rate": 1.0868407668684998e-05, "loss": 1.2443, "step": 1185 }, { "epoch": 0.6748221906116643, "grad_norm": 0.5840954780578613, "learning_rate": 1.0765574091829933e-05, "loss": 1.4682, "step": 1186 }, { "epoch": 0.6753911806543386, "grad_norm": 0.6113706827163696, "learning_rate": 1.0663201637042252e-05, "loss": 1.4292, "step": 1187 }, { "epoch": 0.6759601706970128, "grad_norm": 0.6047906279563904, "learning_rate": 1.0561290833335224e-05, "loss": 1.627, "step": 1188 }, { "epoch": 0.676529160739687, "grad_norm": 0.5755859613418579, "learning_rate": 1.04598422073366e-05, "loss": 1.3449, "step": 1189 }, { "epoch": 0.6770981507823614, "grad_norm": 0.5938130021095276, "learning_rate": 1.0358856283285722e-05, "loss": 1.389, "step": 1190 }, { "epoch": 0.6776671408250355, "grad_norm": 0.5975162386894226, "learning_rate": 1.0258333583030955e-05, "loss": 1.4868, "step": 1191 }, { "epoch": 0.6782361308677098, "grad_norm": 0.6362975239753723, "learning_rate": 1.0158274626026931e-05, "loss": 1.6409, "step": 1192 }, { "epoch": 0.678805120910384, "grad_norm": 0.6175844669342041, "learning_rate": 1.0058679929331827e-05, "loss": 1.4914, "step": 1193 }, { "epoch": 0.6793741109530583, "grad_norm": 0.5870533585548401, "learning_rate": 9.959550007604835e-06, "loss": 1.3655, "step": 1194 }, { "epoch": 0.6799431009957326, "grad_norm": 0.5993149280548096, "learning_rate": 9.860885373103324e-06, "loss": 1.4203, "step": 1195 }, { "epoch": 0.6805120910384068, "grad_norm": 0.5798912048339844, "learning_rate": 9.7626865356803e-06, "loss": 1.4378, "step": 1196 }, { "epoch": 0.6810810810810811, "grad_norm": 0.5729113221168518, "learning_rate": 9.664954002781745e-06, "loss": 1.4054, "step": 1197 }, { "epoch": 0.6816500711237553, "grad_norm": 0.6329131126403809, "learning_rate": 9.567688279443964e-06, "loss": 1.4381, "step": 1198 }, { "epoch": 0.6822190611664296, "grad_norm": 0.6088592410087585, "learning_rate": 9.4708898682911e-06, "loss": 1.4094, "step": 1199 }, { "epoch": 0.6827880512091038, "grad_norm": 0.5889382362365723, "learning_rate": 9.374559269532346e-06, "loss": 1.5365, "step": 1200 }, { "epoch": 0.6833570412517781, "grad_norm": 0.6487043499946594, "learning_rate": 9.27869698095951e-06, "loss": 1.4747, "step": 1201 }, { "epoch": 0.6839260312944524, "grad_norm": 0.6006666421890259, "learning_rate": 9.183303497944361e-06, "loss": 1.3953, "step": 1202 }, { "epoch": 0.6844950213371266, "grad_norm": 0.5925318002700806, "learning_rate": 9.088379313436113e-06, "loss": 1.5679, "step": 1203 }, { "epoch": 0.6850640113798009, "grad_norm": 0.5964149832725525, "learning_rate": 8.993924917958874e-06, "loss": 1.4872, "step": 1204 }, { "epoch": 0.6856330014224751, "grad_norm": 0.5567532777786255, "learning_rate": 8.899940799609096e-06, "loss": 1.3922, "step": 1205 }, { "epoch": 0.6862019914651494, "grad_norm": 0.5803432464599609, "learning_rate": 8.806427444053033e-06, "loss": 1.319, "step": 1206 }, { "epoch": 0.6867709815078236, "grad_norm": 0.583640456199646, "learning_rate": 8.713385334524283e-06, "loss": 1.4564, "step": 1207 }, { "epoch": 0.6873399715504979, "grad_norm": 0.6316723227500916, "learning_rate": 8.620814951821232e-06, "loss": 1.4586, "step": 1208 }, { "epoch": 0.6879089615931722, "grad_norm": 0.5926545262336731, "learning_rate": 8.528716774304658e-06, "loss": 1.5008, "step": 1209 }, { "epoch": 0.6884779516358464, "grad_norm": 0.5738364458084106, "learning_rate": 8.43709127789517e-06, "loss": 1.3766, "step": 1210 }, { "epoch": 0.6890469416785207, "grad_norm": 0.5985202193260193, "learning_rate": 8.345938936070718e-06, "loss": 1.5175, "step": 1211 }, { "epoch": 0.6896159317211948, "grad_norm": 0.6196452379226685, "learning_rate": 8.255260219864324e-06, "loss": 1.6161, "step": 1212 }, { "epoch": 0.6901849217638691, "grad_norm": 0.6303586959838867, "learning_rate": 8.16505559786146e-06, "loss": 1.6054, "step": 1213 }, { "epoch": 0.6907539118065433, "grad_norm": 0.5856702327728271, "learning_rate": 8.07532553619772e-06, "loss": 1.5131, "step": 1214 }, { "epoch": 0.6913229018492176, "grad_norm": 0.5996472239494324, "learning_rate": 7.986070498556397e-06, "loss": 1.3462, "step": 1215 }, { "epoch": 0.6918918918918919, "grad_norm": 0.6016293168067932, "learning_rate": 7.897290946166037e-06, "loss": 1.3177, "step": 1216 }, { "epoch": 0.6924608819345661, "grad_norm": 0.5805103182792664, "learning_rate": 7.808987337798158e-06, "loss": 1.1701, "step": 1217 }, { "epoch": 0.6930298719772404, "grad_norm": 0.5823555588722229, "learning_rate": 7.721160129764792e-06, "loss": 1.275, "step": 1218 }, { "epoch": 0.6935988620199146, "grad_norm": 0.5881485939025879, "learning_rate": 7.633809775916135e-06, "loss": 1.3304, "step": 1219 }, { "epoch": 0.6941678520625889, "grad_norm": 0.610157310962677, "learning_rate": 7.546936727638298e-06, "loss": 1.325, "step": 1220 }, { "epoch": 0.6947368421052632, "grad_norm": 0.6015896797180176, "learning_rate": 7.460541433850788e-06, "loss": 1.4739, "step": 1221 }, { "epoch": 0.6953058321479374, "grad_norm": 0.6073941588401794, "learning_rate": 7.374624341004388e-06, "loss": 1.6308, "step": 1222 }, { "epoch": 0.6958748221906117, "grad_norm": 0.5897748470306396, "learning_rate": 7.289185893078721e-06, "loss": 1.4808, "step": 1223 }, { "epoch": 0.6964438122332859, "grad_norm": 0.6318244338035583, "learning_rate": 7.204226531579994e-06, "loss": 1.5134, "step": 1224 }, { "epoch": 0.6970128022759602, "grad_norm": 0.5809091329574585, "learning_rate": 7.119746695538765e-06, "loss": 1.4117, "step": 1225 }, { "epoch": 0.6975817923186344, "grad_norm": 0.6108141541481018, "learning_rate": 7.0357468215075275e-06, "loss": 1.3201, "step": 1226 }, { "epoch": 0.6981507823613087, "grad_norm": 0.566813051700592, "learning_rate": 6.952227343558671e-06, "loss": 1.502, "step": 1227 }, { "epoch": 0.698719772403983, "grad_norm": 0.5999999046325684, "learning_rate": 6.869188693282036e-06, "loss": 1.3958, "step": 1228 }, { "epoch": 0.6992887624466572, "grad_norm": 0.6325690150260925, "learning_rate": 6.786631299782797e-06, "loss": 1.4682, "step": 1229 }, { "epoch": 0.6998577524893315, "grad_norm": 0.5865020155906677, "learning_rate": 6.704555589679262e-06, "loss": 1.4662, "step": 1230 }, { "epoch": 0.7004267425320057, "grad_norm": 0.5978051424026489, "learning_rate": 6.622961987100518e-06, "loss": 1.4549, "step": 1231 }, { "epoch": 0.70099573257468, "grad_norm": 0.6172093152999878, "learning_rate": 6.541850913684444e-06, "loss": 1.52, "step": 1232 }, { "epoch": 0.7015647226173541, "grad_norm": 0.6080772280693054, "learning_rate": 6.461222788575394e-06, "loss": 1.5765, "step": 1233 }, { "epoch": 0.7021337126600284, "grad_norm": 0.6048703193664551, "learning_rate": 6.3810780284220495e-06, "loss": 1.6723, "step": 1234 }, { "epoch": 0.7027027027027027, "grad_norm": 0.5950552225112915, "learning_rate": 6.301417047375347e-06, "loss": 1.4492, "step": 1235 }, { "epoch": 0.7032716927453769, "grad_norm": 0.5936811566352844, "learning_rate": 6.222240257086176e-06, "loss": 1.4721, "step": 1236 }, { "epoch": 0.7038406827880512, "grad_norm": 0.5990265011787415, "learning_rate": 6.143548066703475e-06, "loss": 1.3644, "step": 1237 }, { "epoch": 0.7044096728307254, "grad_norm": 0.5738005638122559, "learning_rate": 6.065340882871906e-06, "loss": 1.4847, "step": 1238 }, { "epoch": 0.7049786628733997, "grad_norm": 0.5998217463493347, "learning_rate": 5.9876191097298475e-06, "loss": 1.4917, "step": 1239 }, { "epoch": 0.705547652916074, "grad_norm": 0.5693299174308777, "learning_rate": 5.910383148907395e-06, "loss": 1.3934, "step": 1240 }, { "epoch": 0.7061166429587482, "grad_norm": 0.5998255014419556, "learning_rate": 5.8336333995240526e-06, "loss": 1.6348, "step": 1241 }, { "epoch": 0.7066856330014225, "grad_norm": 0.6056730151176453, "learning_rate": 5.757370258186889e-06, "loss": 1.4748, "step": 1242 }, { "epoch": 0.7072546230440967, "grad_norm": 0.6141417622566223, "learning_rate": 5.6815941189884315e-06, "loss": 1.4371, "step": 1243 }, { "epoch": 0.707823613086771, "grad_norm": 0.5924522280693054, "learning_rate": 5.606305373504528e-06, "loss": 1.4896, "step": 1244 }, { "epoch": 0.7083926031294452, "grad_norm": 0.5907067656517029, "learning_rate": 5.5315044107925094e-06, "loss": 1.5258, "step": 1245 }, { "epoch": 0.7089615931721195, "grad_norm": 0.5856897234916687, "learning_rate": 5.457191617388957e-06, "loss": 1.3751, "step": 1246 }, { "epoch": 0.7095305832147938, "grad_norm": 0.5863030552864075, "learning_rate": 5.383367377307857e-06, "loss": 1.2607, "step": 1247 }, { "epoch": 0.710099573257468, "grad_norm": 0.5891332626342773, "learning_rate": 5.310032072038651e-06, "loss": 1.3852, "step": 1248 }, { "epoch": 0.7106685633001423, "grad_norm": 0.5775113701820374, "learning_rate": 5.237186080544098e-06, "loss": 1.5867, "step": 1249 }, { "epoch": 0.7112375533428165, "grad_norm": 0.5843526721000671, "learning_rate": 5.164829779258451e-06, "loss": 1.5694, "step": 1250 }, { "epoch": 0.7118065433854908, "grad_norm": 0.7019409537315369, "learning_rate": 5.092963542085483e-06, "loss": 1.4444, "step": 1251 }, { "epoch": 0.712375533428165, "grad_norm": 0.6279569864273071, "learning_rate": 5.021587740396505e-06, "loss": 1.5798, "step": 1252 }, { "epoch": 0.7129445234708393, "grad_norm": 0.6179226040840149, "learning_rate": 4.950702743028535e-06, "loss": 1.3976, "step": 1253 }, { "epoch": 0.7135135135135136, "grad_norm": 0.5874016284942627, "learning_rate": 4.880308916282305e-06, "loss": 1.5384, "step": 1254 }, { "epoch": 0.7140825035561877, "grad_norm": 0.5691651701927185, "learning_rate": 4.810406623920427e-06, "loss": 1.3594, "step": 1255 }, { "epoch": 0.714651493598862, "grad_norm": 0.5660080909729004, "learning_rate": 4.740996227165462e-06, "loss": 1.5635, "step": 1256 }, { "epoch": 0.7152204836415362, "grad_norm": 0.6053207516670227, "learning_rate": 4.672078084698095e-06, "loss": 1.5981, "step": 1257 }, { "epoch": 0.7157894736842105, "grad_norm": 0.5976085066795349, "learning_rate": 4.603652552655302e-06, "loss": 1.5909, "step": 1258 }, { "epoch": 0.7163584637268848, "grad_norm": 0.547666609287262, "learning_rate": 4.53571998462845e-06, "loss": 1.3622, "step": 1259 }, { "epoch": 0.716927453769559, "grad_norm": 0.6441154479980469, "learning_rate": 4.468280731661489e-06, "loss": 1.4626, "step": 1260 }, { "epoch": 0.7174964438122333, "grad_norm": 0.6044400930404663, "learning_rate": 4.4013351422491635e-06, "loss": 1.5432, "step": 1261 }, { "epoch": 0.7180654338549075, "grad_norm": 0.5658133029937744, "learning_rate": 4.334883562335157e-06, "loss": 1.4528, "step": 1262 }, { "epoch": 0.7186344238975818, "grad_norm": 0.6291137933731079, "learning_rate": 4.268926335310408e-06, "loss": 1.2975, "step": 1263 }, { "epoch": 0.719203413940256, "grad_norm": 0.5724123120307922, "learning_rate": 4.20346380201122e-06, "loss": 1.223, "step": 1264 }, { "epoch": 0.7197724039829303, "grad_norm": 0.6324455142021179, "learning_rate": 4.138496300717565e-06, "loss": 1.3516, "step": 1265 }, { "epoch": 0.7203413940256046, "grad_norm": 0.5916242599487305, "learning_rate": 4.0740241671513025e-06, "loss": 1.6546, "step": 1266 }, { "epoch": 0.7209103840682788, "grad_norm": 0.5780736804008484, "learning_rate": 4.010047734474454e-06, "loss": 1.4467, "step": 1267 }, { "epoch": 0.7214793741109531, "grad_norm": 0.580437958240509, "learning_rate": 3.946567333287566e-06, "loss": 1.2151, "step": 1268 }, { "epoch": 0.7220483641536273, "grad_norm": 0.631999135017395, "learning_rate": 3.883583291627823e-06, "loss": 1.6731, "step": 1269 }, { "epoch": 0.7226173541963016, "grad_norm": 0.5912725329399109, "learning_rate": 3.821095934967511e-06, "loss": 1.5419, "step": 1270 }, { "epoch": 0.7231863442389758, "grad_norm": 0.5841814279556274, "learning_rate": 3.759105586212275e-06, "loss": 1.36, "step": 1271 }, { "epoch": 0.7237553342816501, "grad_norm": 0.620486319065094, "learning_rate": 3.6976125656994376e-06, "loss": 1.4474, "step": 1272 }, { "epoch": 0.7243243243243244, "grad_norm": 0.5620819330215454, "learning_rate": 3.6366171911963455e-06, "loss": 1.3565, "step": 1273 }, { "epoch": 0.7248933143669986, "grad_norm": 0.6318161487579346, "learning_rate": 3.576119777898812e-06, "loss": 1.5721, "step": 1274 }, { "epoch": 0.7254623044096729, "grad_norm": 0.5643869638442993, "learning_rate": 3.516120638429332e-06, "loss": 1.3681, "step": 1275 }, { "epoch": 0.726031294452347, "grad_norm": 0.5829715132713318, "learning_rate": 3.4566200828356157e-06, "loss": 1.3699, "step": 1276 }, { "epoch": 0.7266002844950213, "grad_norm": 0.5623791813850403, "learning_rate": 3.397618418588877e-06, "loss": 1.3686, "step": 1277 }, { "epoch": 0.7271692745376956, "grad_norm": 0.5907699465751648, "learning_rate": 3.3391159505823165e-06, "loss": 1.5019, "step": 1278 }, { "epoch": 0.7277382645803698, "grad_norm": 0.5887323617935181, "learning_rate": 3.2811129811295416e-06, "loss": 1.5161, "step": 1279 }, { "epoch": 0.7283072546230441, "grad_norm": 0.6375420093536377, "learning_rate": 3.2236098099629353e-06, "loss": 1.53, "step": 1280 }, { "epoch": 0.7288762446657183, "grad_norm": 0.569848358631134, "learning_rate": 3.16660673423218e-06, "loss": 1.5462, "step": 1281 }, { "epoch": 0.7294452347083926, "grad_norm": 0.5773903727531433, "learning_rate": 3.1101040485027043e-06, "loss": 1.4332, "step": 1282 }, { "epoch": 0.7300142247510668, "grad_norm": 0.5759513974189758, "learning_rate": 3.0541020447541256e-06, "loss": 1.4906, "step": 1283 }, { "epoch": 0.7305832147937411, "grad_norm": 0.5894652009010315, "learning_rate": 2.99860101237881e-06, "loss": 1.3007, "step": 1284 }, { "epoch": 0.7311522048364154, "grad_norm": 0.5720746517181396, "learning_rate": 2.9436012381803156e-06, "loss": 1.5254, "step": 1285 }, { "epoch": 0.7317211948790896, "grad_norm": 0.6133726239204407, "learning_rate": 2.8891030063719183e-06, "loss": 1.6029, "step": 1286 }, { "epoch": 0.7322901849217639, "grad_norm": 0.6293920874595642, "learning_rate": 2.8351065985751766e-06, "loss": 1.5918, "step": 1287 }, { "epoch": 0.7328591749644381, "grad_norm": 0.5941974520683289, "learning_rate": 2.7816122938184255e-06, "loss": 1.43, "step": 1288 }, { "epoch": 0.7334281650071124, "grad_norm": 0.5790094137191772, "learning_rate": 2.7286203685354063e-06, "loss": 1.4635, "step": 1289 }, { "epoch": 0.7339971550497866, "grad_norm": 0.593591570854187, "learning_rate": 2.6761310965637833e-06, "loss": 1.554, "step": 1290 }, { "epoch": 0.7345661450924609, "grad_norm": 0.6287367939949036, "learning_rate": 2.62414474914372e-06, "loss": 1.2736, "step": 1291 }, { "epoch": 0.7351351351351352, "grad_norm": 0.586243748664856, "learning_rate": 2.5726615949165254e-06, "loss": 1.6281, "step": 1292 }, { "epoch": 0.7357041251778094, "grad_norm": 0.6094790697097778, "learning_rate": 2.5216818999232117e-06, "loss": 1.4495, "step": 1293 }, { "epoch": 0.7362731152204837, "grad_norm": 0.5789740681648254, "learning_rate": 2.4712059276031816e-06, "loss": 1.6063, "step": 1294 }, { "epoch": 0.7368421052631579, "grad_norm": 0.5999897122383118, "learning_rate": 2.421233938792811e-06, "loss": 1.3805, "step": 1295 }, { "epoch": 0.7374110953058322, "grad_norm": 0.5815314054489136, "learning_rate": 2.3717661917241117e-06, "loss": 1.4289, "step": 1296 }, { "epoch": 0.7379800853485065, "grad_norm": 0.5862295031547546, "learning_rate": 2.322802942023461e-06, "loss": 1.4672, "step": 1297 }, { "epoch": 0.7385490753911806, "grad_norm": 0.5870852470397949, "learning_rate": 2.2743444427101525e-06, "loss": 1.5368, "step": 1298 }, { "epoch": 0.7391180654338549, "grad_norm": 0.5981742143630981, "learning_rate": 2.2263909441952226e-06, "loss": 1.4996, "step": 1299 }, { "epoch": 0.7396870554765291, "grad_norm": 0.6381643414497375, "learning_rate": 2.178942694280095e-06, "loss": 1.4773, "step": 1300 }, { "epoch": 0.7402560455192034, "grad_norm": 0.5861015915870667, "learning_rate": 2.1319999381552604e-06, "loss": 1.3885, "step": 1301 }, { "epoch": 0.7408250355618776, "grad_norm": 0.5836819410324097, "learning_rate": 2.0855629183990867e-06, "loss": 1.4594, "step": 1302 }, { "epoch": 0.7413940256045519, "grad_norm": 0.57367342710495, "learning_rate": 2.039631874976533e-06, "loss": 1.5536, "step": 1303 }, { "epoch": 0.7419630156472262, "grad_norm": 0.6635316014289856, "learning_rate": 1.9942070452378836e-06, "loss": 1.3837, "step": 1304 }, { "epoch": 0.7425320056899004, "grad_norm": 0.6087902784347534, "learning_rate": 1.9492886639175922e-06, "loss": 1.5232, "step": 1305 }, { "epoch": 0.7431009957325747, "grad_norm": 0.5868977308273315, "learning_rate": 1.9048769631329399e-06, "loss": 1.5394, "step": 1306 }, { "epoch": 0.7436699857752489, "grad_norm": 0.5506064891815186, "learning_rate": 1.8609721723830132e-06, "loss": 1.3222, "step": 1307 }, { "epoch": 0.7442389758179232, "grad_norm": 0.6256322860717773, "learning_rate": 1.8175745185473714e-06, "loss": 1.6126, "step": 1308 }, { "epoch": 0.7448079658605974, "grad_norm": 0.6551912426948547, "learning_rate": 1.774684225884904e-06, "loss": 1.6678, "step": 1309 }, { "epoch": 0.7453769559032717, "grad_norm": 0.6695655584335327, "learning_rate": 1.7323015160327638e-06, "loss": 1.4653, "step": 1310 }, { "epoch": 0.745945945945946, "grad_norm": 0.5701092481613159, "learning_rate": 1.690426608005069e-06, "loss": 1.4619, "step": 1311 }, { "epoch": 0.7465149359886202, "grad_norm": 0.5779337882995605, "learning_rate": 1.6490597181919254e-06, "loss": 1.3819, "step": 1312 }, { "epoch": 0.7470839260312945, "grad_norm": 0.6129727959632874, "learning_rate": 1.6082010603582053e-06, "loss": 1.5916, "step": 1313 }, { "epoch": 0.7476529160739687, "grad_norm": 0.627040445804596, "learning_rate": 1.567850845642449e-06, "loss": 1.4437, "step": 1314 }, { "epoch": 0.748221906116643, "grad_norm": 0.5893445611000061, "learning_rate": 1.5280092825558645e-06, "loss": 1.4348, "step": 1315 }, { "epoch": 0.7487908961593173, "grad_norm": 0.5637263059616089, "learning_rate": 1.4886765769811072e-06, "loss": 1.4235, "step": 1316 }, { "epoch": 0.7493598862019915, "grad_norm": 0.5983802080154419, "learning_rate": 1.4498529321713584e-06, "loss": 1.5322, "step": 1317 }, { "epoch": 0.7499288762446658, "grad_norm": 0.6314471364021301, "learning_rate": 1.4115385487491583e-06, "loss": 1.511, "step": 1318 }, { "epoch": 0.7504978662873399, "grad_norm": 0.6288767457008362, "learning_rate": 1.3737336247054644e-06, "loss": 1.5245, "step": 1319 }, { "epoch": 0.7510668563300142, "grad_norm": 0.5633329153060913, "learning_rate": 1.3364383553985726e-06, "loss": 1.5002, "step": 1320 }, { "epoch": 0.7516358463726884, "grad_norm": 0.5696760416030884, "learning_rate": 1.2996529335530749e-06, "loss": 1.4833, "step": 1321 }, { "epoch": 0.7522048364153627, "grad_norm": 0.6070805191993713, "learning_rate": 1.2633775492589816e-06, "loss": 1.3631, "step": 1322 }, { "epoch": 0.752773826458037, "grad_norm": 0.6544724702835083, "learning_rate": 1.2276123899706227e-06, "loss": 1.3451, "step": 1323 }, { "epoch": 0.7533428165007112, "grad_norm": 0.5900003910064697, "learning_rate": 1.1923576405057258e-06, "loss": 1.4344, "step": 1324 }, { "epoch": 0.7539118065433855, "grad_norm": 0.5852407217025757, "learning_rate": 1.1576134830444619e-06, "loss": 1.3403, "step": 1325 }, { "epoch": 0.7544807965860597, "grad_norm": 0.557827353477478, "learning_rate": 1.1233800971285013e-06, "loss": 1.2692, "step": 1326 }, { "epoch": 0.755049786628734, "grad_norm": 0.5728473663330078, "learning_rate": 1.0896576596600705e-06, "loss": 1.2999, "step": 1327 }, { "epoch": 0.7556187766714082, "grad_norm": 0.6241177320480347, "learning_rate": 1.0564463449010852e-06, "loss": 1.3683, "step": 1328 }, { "epoch": 0.7561877667140825, "grad_norm": 0.5648365020751953, "learning_rate": 1.0237463244721747e-06, "loss": 1.4297, "step": 1329 }, { "epoch": 0.7567567567567568, "grad_norm": 0.590801477432251, "learning_rate": 9.915577673518695e-07, "loss": 1.5818, "step": 1330 }, { "epoch": 0.757325746799431, "grad_norm": 0.6130539178848267, "learning_rate": 9.59880839875682e-07, "loss": 1.2326, "step": 1331 }, { "epoch": 0.7578947368421053, "grad_norm": 0.5840870141983032, "learning_rate": 9.287157057352502e-07, "loss": 1.4967, "step": 1332 }, { "epoch": 0.7584637268847795, "grad_norm": 0.5912277102470398, "learning_rate": 8.980625259775277e-07, "loss": 1.5008, "step": 1333 }, { "epoch": 0.7590327169274538, "grad_norm": 0.5769196152687073, "learning_rate": 8.679214590039064e-07, "loss": 1.29, "step": 1334 }, { "epoch": 0.7596017069701281, "grad_norm": 0.6438156366348267, "learning_rate": 8.382926605694064e-07, "loss": 1.4775, "step": 1335 }, { "epoch": 0.7601706970128023, "grad_norm": 0.5996107459068298, "learning_rate": 8.091762837819094e-07, "loss": 1.5507, "step": 1336 }, { "epoch": 0.7607396870554766, "grad_norm": 0.6058631539344788, "learning_rate": 7.80572479101327e-07, "loss": 1.4726, "step": 1337 }, { "epoch": 0.7613086770981508, "grad_norm": 0.5997673273086548, "learning_rate": 7.524813943388331e-07, "loss": 1.5108, "step": 1338 }, { "epoch": 0.761877667140825, "grad_norm": 0.5843853950500488, "learning_rate": 7.249031746561108e-07, "loss": 1.4569, "step": 1339 }, { "epoch": 0.7624466571834992, "grad_norm": 0.5930002927780151, "learning_rate": 6.978379625645959e-07, "loss": 1.4858, "step": 1340 }, { "epoch": 0.7630156472261735, "grad_norm": 0.5973467826843262, "learning_rate": 6.712858979247116e-07, "loss": 1.5819, "step": 1341 }, { "epoch": 0.7635846372688478, "grad_norm": 0.601449191570282, "learning_rate": 6.452471179452135e-07, "loss": 1.6227, "step": 1342 }, { "epoch": 0.764153627311522, "grad_norm": 0.5814241170883179, "learning_rate": 6.197217571824232e-07, "loss": 1.3806, "step": 1343 }, { "epoch": 0.7647226173541963, "grad_norm": 0.5642287731170654, "learning_rate": 5.947099475395402e-07, "loss": 1.1583, "step": 1344 }, { "epoch": 0.7652916073968705, "grad_norm": 0.5667275190353394, "learning_rate": 5.702118182659866e-07, "loss": 1.5422, "step": 1345 }, { "epoch": 0.7658605974395448, "grad_norm": 0.5716063976287842, "learning_rate": 5.462274959567193e-07, "loss": 1.4454, "step": 1346 }, { "epoch": 0.766429587482219, "grad_norm": 0.5906545519828796, "learning_rate": 5.227571045515633e-07, "loss": 1.4336, "step": 1347 }, { "epoch": 0.7669985775248933, "grad_norm": 0.5766403079032898, "learning_rate": 4.998007653346126e-07, "loss": 1.3452, "step": 1348 }, { "epoch": 0.7675675675675676, "grad_norm": 0.5666573643684387, "learning_rate": 4.773585969335636e-07, "loss": 1.4239, "step": 1349 }, { "epoch": 0.7681365576102418, "grad_norm": 0.586741030216217, "learning_rate": 4.554307153191273e-07, "loss": 1.4837, "step": 1350 }, { "epoch": 0.7687055476529161, "grad_norm": 0.5627419948577881, "learning_rate": 4.340172338043846e-07, "loss": 1.1588, "step": 1351 }, { "epoch": 0.7692745376955903, "grad_norm": 0.6001150608062744, "learning_rate": 4.131182630442876e-07, "loss": 1.6122, "step": 1352 }, { "epoch": 0.7698435277382646, "grad_norm": 0.6114206314086914, "learning_rate": 3.9273391103499257e-07, "loss": 1.493, "step": 1353 }, { "epoch": 0.7704125177809389, "grad_norm": 0.5789377689361572, "learning_rate": 3.728642831133833e-07, "loss": 1.4742, "step": 1354 }, { "epoch": 0.7709815078236131, "grad_norm": 0.6265093684196472, "learning_rate": 3.5350948195645993e-07, "loss": 1.4284, "step": 1355 }, { "epoch": 0.7715504978662874, "grad_norm": 0.6009383201599121, "learning_rate": 3.3466960758082867e-07, "loss": 1.3373, "step": 1356 }, { "epoch": 0.7721194879089616, "grad_norm": 0.5908554196357727, "learning_rate": 3.163447573422351e-07, "loss": 1.4147, "step": 1357 }, { "epoch": 0.7726884779516359, "grad_norm": 0.614896297454834, "learning_rate": 2.985350259349762e-07, "loss": 1.3802, "step": 1358 }, { "epoch": 0.77325746799431, "grad_norm": 0.6063140630722046, "learning_rate": 2.812405053914891e-07, "loss": 1.66, "step": 1359 }, { "epoch": 0.7738264580369844, "grad_norm": 0.5757827758789062, "learning_rate": 2.644612850818073e-07, "loss": 1.4361, "step": 1360 }, { "epoch": 0.7743954480796587, "grad_norm": 0.5920888185501099, "learning_rate": 2.481974517131502e-07, "loss": 1.3681, "step": 1361 }, { "epoch": 0.7749644381223328, "grad_norm": 0.5669330358505249, "learning_rate": 2.324490893294673e-07, "loss": 1.5391, "step": 1362 }, { "epoch": 0.7755334281650071, "grad_norm": 0.5745005011558533, "learning_rate": 2.172162793109611e-07, "loss": 1.3118, "step": 1363 }, { "epoch": 0.7761024182076813, "grad_norm": 0.5584404468536377, "learning_rate": 2.0249910037374308e-07, "loss": 1.4001, "step": 1364 }, { "epoch": 0.7766714082503556, "grad_norm": 0.6618481874465942, "learning_rate": 1.8829762856933387e-07, "loss": 1.4373, "step": 1365 }, { "epoch": 0.7772403982930298, "grad_norm": 0.5788954496383667, "learning_rate": 1.7461193728436353e-07, "loss": 1.5453, "step": 1366 }, { "epoch": 0.7778093883357041, "grad_norm": 0.6013731956481934, "learning_rate": 1.614420972401165e-07, "loss": 1.5133, "step": 1367 }, { "epoch": 0.7783783783783784, "grad_norm": 0.622704029083252, "learning_rate": 1.4878817649220944e-07, "loss": 1.4024, "step": 1368 }, { "epoch": 0.7789473684210526, "grad_norm": 0.5754665732383728, "learning_rate": 1.36650240430225e-07, "loss": 1.2369, "step": 1369 }, { "epoch": 0.7795163584637269, "grad_norm": 0.5812003016471863, "learning_rate": 1.250283517774009e-07, "loss": 1.5473, "step": 1370 }, { "epoch": 0.7800853485064011, "grad_norm": 0.6443690657615662, "learning_rate": 1.1392257059023026e-07, "loss": 1.5182, "step": 1371 }, { "epoch": 0.7806543385490754, "grad_norm": 0.5886285901069641, "learning_rate": 1.0333295425825063e-07, "loss": 1.6711, "step": 1372 }, { "epoch": 0.7812233285917497, "grad_norm": 0.5866546034812927, "learning_rate": 9.325955750367766e-08, "loss": 1.326, "step": 1373 }, { "epoch": 0.7817923186344239, "grad_norm": 0.5975569486618042, "learning_rate": 8.370243238113862e-08, "loss": 1.522, "step": 1374 }, { "epoch": 0.7823613086770982, "grad_norm": 0.5648385286331177, "learning_rate": 7.466162827742817e-08, "loss": 1.4345, "step": 1375 }, { "epoch": 0.7829302987197724, "grad_norm": 0.6075783371925354, "learning_rate": 6.61371919112197e-08, "loss": 1.4661, "step": 1376 }, { "epoch": 0.7834992887624467, "grad_norm": 0.6267833113670349, "learning_rate": 5.812916733284324e-08, "loss": 1.5151, "step": 1377 }, { "epoch": 0.7840682788051209, "grad_norm": 0.5820707082748413, "learning_rate": 5.063759592404127e-08, "loss": 1.4866, "step": 1378 }, { "epoch": 0.7846372688477952, "grad_norm": 0.5778554677963257, "learning_rate": 4.366251639777996e-08, "loss": 1.3435, "step": 1379 }, { "epoch": 0.7852062588904695, "grad_norm": 0.5690316557884216, "learning_rate": 3.720396479803823e-08, "loss": 1.4896, "step": 1380 }, { "epoch": 0.7857752489331437, "grad_norm": 0.5569249391555786, "learning_rate": 3.126197449959678e-08, "loss": 1.358, "step": 1381 }, { "epoch": 0.786344238975818, "grad_norm": 0.6170508861541748, "learning_rate": 2.5836576207916018e-08, "loss": 1.4965, "step": 1382 }, { "epoch": 0.7869132290184921, "grad_norm": 0.5334784388542175, "learning_rate": 2.092779795892508e-08, "loss": 1.2593, "step": 1383 }, { "epoch": 0.7874822190611664, "grad_norm": 0.6333361864089966, "learning_rate": 1.6535665118910802e-08, "loss": 1.3485, "step": 1384 }, { "epoch": 0.7880512091038406, "grad_norm": 0.6247609853744507, "learning_rate": 1.2660200384384536e-08, "loss": 1.4111, "step": 1385 }, { "epoch": 0.7886201991465149, "grad_norm": 0.5764187574386597, "learning_rate": 9.301423781926666e-09, "loss": 1.201, "step": 1386 }, { "epoch": 0.7891891891891892, "grad_norm": 0.5923343896865845, "learning_rate": 6.459352668164442e-09, "loss": 1.5227, "step": 1387 }, { "epoch": 0.7897581792318634, "grad_norm": 0.578344464302063, "learning_rate": 4.134001729583226e-09, "loss": 1.3055, "step": 1388 }, { "epoch": 0.7903271692745377, "grad_norm": 0.595220148563385, "learning_rate": 2.3253829825153894e-09, "loss": 1.5065, "step": 1389 }, { "epoch": 0.7908961593172119, "grad_norm": 0.5846388339996338, "learning_rate": 1.033505773062604e-09, "loss": 1.5331, "step": 1390 }, { "epoch": 0.7914651493598862, "grad_norm": 0.5945976376533508, "learning_rate": 2.5837677706253003e-10, "loss": 1.5147, "step": 1391 }, { "epoch": 0.7920341394025604, "grad_norm": 0.6109771728515625, "learning_rate": 0.0, "loss": 1.508, "step": 1392 }, { "epoch": 0.7920341394025604, "eval_loss": 1.461509108543396, "eval_runtime": 16.4544, "eval_samples_per_second": 44.973, "eval_steps_per_second": 22.486, "step": 1392 } ], "logging_steps": 1, "max_steps": 1392, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 348, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.474815762936627e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }