{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988751406074241, "eval_steps": 500, "global_step": 777, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012855535915153463, "grad_norm": 0.020836442708969116, "learning_rate": 0.0, "loss": 0.6262, "step": 1 }, { "epoch": 0.0025711071830306926, "grad_norm": 0.022134091705083847, "learning_rate": 4.075900941810124e-06, "loss": 0.8688, "step": 2 }, { "epoch": 0.003856660774546039, "grad_norm": 0.023200005292892456, "learning_rate": 6.46015014942309e-06, "loss": 0.7864, "step": 3 }, { "epoch": 0.005142214366061385, "grad_norm": 0.02313530258834362, "learning_rate": 8.151801883620247e-06, "loss": 0.8897, "step": 4 }, { "epoch": 0.0064277679575767315, "grad_norm": 0.020561356097459793, "learning_rate": 9.463948908766788e-06, "loss": 0.6479, "step": 5 }, { "epoch": 0.007713321549092078, "grad_norm": 0.021870166063308716, "learning_rate": 1.0536051091233212e-05, "loss": 0.7501, "step": 6 }, { "epoch": 0.008998875140607425, "grad_norm": 0.023460067808628082, "learning_rate": 1.1442500570809876e-05, "loss": 0.8672, "step": 7 }, { "epoch": 0.01028442873212277, "grad_norm": 0.02368471957743168, "learning_rate": 1.222770282543037e-05, "loss": 0.8984, "step": 8 }, { "epoch": 0.011569982323638118, "grad_norm": 0.020765064284205437, "learning_rate": 1.292030029884618e-05, "loss": 0.6547, "step": 9 }, { "epoch": 0.012855535915153463, "grad_norm": 0.023947741836309433, "learning_rate": 1.3539849850576912e-05, "loss": 0.8205, "step": 10 }, { "epoch": 0.01414108950666881, "grad_norm": 0.028013406321406364, "learning_rate": 1.4100300592531481e-05, "loss": 0.7891, "step": 11 }, { "epoch": 0.015426643098184156, "grad_norm": 0.027111703529953957, "learning_rate": 1.4611952033043337e-05, "loss": 0.7678, "step": 12 }, { "epoch": 0.0167121966896995, "grad_norm": 0.028518904000520706, "learning_rate": 1.5082625732282867e-05, "loss": 0.8091, "step": 13 }, { "epoch": 0.01799775028121485, "grad_norm": 0.022600186988711357, "learning_rate": 1.551840151262e-05, "loss": 0.5875, "step": 14 }, { "epoch": 0.019283303872730195, "grad_norm": 0.021580247208476067, "learning_rate": 1.5924099058189875e-05, "loss": 0.6714, "step": 15 }, { "epoch": 0.02056885746424554, "grad_norm": 0.02505405619740486, "learning_rate": 1.6303603767240495e-05, "loss": 0.7534, "step": 16 }, { "epoch": 0.021854411055760886, "grad_norm": 0.024437466636300087, "learning_rate": 1.6660093644266146e-05, "loss": 0.6945, "step": 17 }, { "epoch": 0.023139964647276235, "grad_norm": 0.028052283450961113, "learning_rate": 1.6996201240656302e-05, "loss": 0.7076, "step": 18 }, { "epoch": 0.02442551823879158, "grad_norm": 0.03632762283086777, "learning_rate": 1.7314131752785847e-05, "loss": 0.769, "step": 19 }, { "epoch": 0.025711071830306926, "grad_norm": 0.02896072156727314, "learning_rate": 1.7615750792387035e-05, "loss": 0.8087, "step": 20 }, { "epoch": 0.02699662542182227, "grad_norm": 0.034198954701423645, "learning_rate": 1.7902650720232966e-05, "loss": 0.8161, "step": 21 }, { "epoch": 0.02828217901333762, "grad_norm": 0.03110469877719879, "learning_rate": 1.8176201534341607e-05, "loss": 0.8253, "step": 22 }, { "epoch": 0.029567732604852966, "grad_norm": 0.039295781403779984, "learning_rate": 1.8437590437029225e-05, "loss": 0.9744, "step": 23 }, { "epoch": 0.03085328619636831, "grad_norm": 0.03249296918511391, "learning_rate": 1.868785297485346e-05, "loss": 0.6455, "step": 24 }, { "epoch": 0.032138839787883657, "grad_norm": 0.03106599487364292, "learning_rate": 1.8927897817533575e-05, "loss": 0.7005, "step": 25 }, { "epoch": 0.033424393379399, "grad_norm": 0.03536655381321907, "learning_rate": 1.915852667409299e-05, "loss": 0.8004, "step": 26 }, { "epoch": 0.03470994697091435, "grad_norm": 0.035472676157951355, "learning_rate": 1.9380450448269272e-05, "loss": 0.675, "step": 27 }, { "epoch": 0.0359955005624297, "grad_norm": 0.03877939283847809, "learning_rate": 1.9594302454430122e-05, "loss": 0.6278, "step": 28 }, { "epoch": 0.037281054153945045, "grad_norm": 0.041341230273246765, "learning_rate": 1.9800649313336155e-05, "loss": 0.914, "step": 29 }, { "epoch": 0.03856660774546039, "grad_norm": 0.042063791304826736, "learning_rate": 2e-05, "loss": 0.7664, "step": 30 }, { "epoch": 0.039852161336975736, "grad_norm": 0.04166961461305618, "learning_rate": 2e-05, "loss": 0.6324, "step": 31 }, { "epoch": 0.04113771492849108, "grad_norm": 0.04256080463528633, "learning_rate": 2e-05, "loss": 0.6771, "step": 32 }, { "epoch": 0.04242326852000643, "grad_norm": 0.042959265410900116, "learning_rate": 2e-05, "loss": 0.6018, "step": 33 }, { "epoch": 0.04370882211152177, "grad_norm": 0.03880544751882553, "learning_rate": 2e-05, "loss": 0.5819, "step": 34 }, { "epoch": 0.04499437570303712, "grad_norm": 0.0412827730178833, "learning_rate": 2e-05, "loss": 0.8133, "step": 35 }, { "epoch": 0.04627992929455247, "grad_norm": 0.04274650663137436, "learning_rate": 2e-05, "loss": 0.6237, "step": 36 }, { "epoch": 0.047565482886067816, "grad_norm": 0.04136871546506882, "learning_rate": 2e-05, "loss": 0.5851, "step": 37 }, { "epoch": 0.04885103647758316, "grad_norm": 0.04220248758792877, "learning_rate": 2e-05, "loss": 0.7603, "step": 38 }, { "epoch": 0.050136590069098506, "grad_norm": 0.039129678159952164, "learning_rate": 2e-05, "loss": 0.5206, "step": 39 }, { "epoch": 0.05142214366061385, "grad_norm": 0.04173429682850838, "learning_rate": 2e-05, "loss": 0.6602, "step": 40 }, { "epoch": 0.0527076972521292, "grad_norm": 0.040010105818510056, "learning_rate": 2e-05, "loss": 0.5964, "step": 41 }, { "epoch": 0.05399325084364454, "grad_norm": 0.03841459006071091, "learning_rate": 2e-05, "loss": 0.6162, "step": 42 }, { "epoch": 0.05527880443515989, "grad_norm": 0.04042840003967285, "learning_rate": 2e-05, "loss": 0.535, "step": 43 }, { "epoch": 0.05656435802667524, "grad_norm": 0.040401577949523926, "learning_rate": 2e-05, "loss": 0.5685, "step": 44 }, { "epoch": 0.057849911618190586, "grad_norm": 0.06742753833532333, "learning_rate": 2e-05, "loss": 0.8276, "step": 45 }, { "epoch": 0.05913546520970593, "grad_norm": 0.040345244109630585, "learning_rate": 2e-05, "loss": 0.5988, "step": 46 }, { "epoch": 0.06042101880122128, "grad_norm": 0.0415828563272953, "learning_rate": 2e-05, "loss": 0.6151, "step": 47 }, { "epoch": 0.06170657239273662, "grad_norm": 0.041223231703042984, "learning_rate": 2e-05, "loss": 0.638, "step": 48 }, { "epoch": 0.06299212598425197, "grad_norm": 0.03628067672252655, "learning_rate": 2e-05, "loss": 0.5031, "step": 49 }, { "epoch": 0.06427767957576731, "grad_norm": 0.04399935156106949, "learning_rate": 2e-05, "loss": 0.6615, "step": 50 }, { "epoch": 0.06556323316728266, "grad_norm": 0.04084352031350136, "learning_rate": 2e-05, "loss": 0.5703, "step": 51 }, { "epoch": 0.066848786758798, "grad_norm": 0.039231687784194946, "learning_rate": 2e-05, "loss": 0.5325, "step": 52 }, { "epoch": 0.06813434035031335, "grad_norm": 0.04078860580921173, "learning_rate": 2e-05, "loss": 0.5959, "step": 53 }, { "epoch": 0.0694198939418287, "grad_norm": 0.03753922879695892, "learning_rate": 2e-05, "loss": 0.5652, "step": 54 }, { "epoch": 0.07070544753334404, "grad_norm": 0.041337307542562485, "learning_rate": 2e-05, "loss": 0.5579, "step": 55 }, { "epoch": 0.0719910011248594, "grad_norm": 0.03940434008836746, "learning_rate": 2e-05, "loss": 0.4821, "step": 56 }, { "epoch": 0.07327655471637474, "grad_norm": 0.03760010376572609, "learning_rate": 2e-05, "loss": 0.4442, "step": 57 }, { "epoch": 0.07456210830789009, "grad_norm": 0.042540181428194046, "learning_rate": 2e-05, "loss": 0.5285, "step": 58 }, { "epoch": 0.07584766189940544, "grad_norm": 0.0457993820309639, "learning_rate": 2e-05, "loss": 0.7136, "step": 59 }, { "epoch": 0.07713321549092078, "grad_norm": 0.033564481884241104, "learning_rate": 2e-05, "loss": 0.4617, "step": 60 }, { "epoch": 0.07841876908243613, "grad_norm": 0.041546691209077835, "learning_rate": 2e-05, "loss": 0.6912, "step": 61 }, { "epoch": 0.07970432267395147, "grad_norm": 0.03729071840643883, "learning_rate": 2e-05, "loss": 0.4584, "step": 62 }, { "epoch": 0.08098987626546682, "grad_norm": 0.04159967973828316, "learning_rate": 2e-05, "loss": 0.5392, "step": 63 }, { "epoch": 0.08227542985698216, "grad_norm": 0.03827968239784241, "learning_rate": 2e-05, "loss": 0.5548, "step": 64 }, { "epoch": 0.08356098344849751, "grad_norm": 0.04405729100108147, "learning_rate": 2e-05, "loss": 0.6239, "step": 65 }, { "epoch": 0.08484653704001285, "grad_norm": 0.03460558503866196, "learning_rate": 2e-05, "loss": 0.4405, "step": 66 }, { "epoch": 0.0861320906315282, "grad_norm": 0.030664170160889626, "learning_rate": 2e-05, "loss": 0.3396, "step": 67 }, { "epoch": 0.08741764422304354, "grad_norm": 0.0376565083861351, "learning_rate": 2e-05, "loss": 0.5822, "step": 68 }, { "epoch": 0.08870319781455889, "grad_norm": 0.0384797677397728, "learning_rate": 2e-05, "loss": 0.5402, "step": 69 }, { "epoch": 0.08998875140607424, "grad_norm": 0.030342888087034225, "learning_rate": 2e-05, "loss": 0.371, "step": 70 }, { "epoch": 0.09127430499758958, "grad_norm": 0.0449620746076107, "learning_rate": 2e-05, "loss": 0.5723, "step": 71 }, { "epoch": 0.09255985858910494, "grad_norm": 0.03808669000864029, "learning_rate": 2e-05, "loss": 0.4842, "step": 72 }, { "epoch": 0.09384541218062029, "grad_norm": 0.03985065966844559, "learning_rate": 2e-05, "loss": 0.4957, "step": 73 }, { "epoch": 0.09513096577213563, "grad_norm": 0.030943365767598152, "learning_rate": 2e-05, "loss": 0.4595, "step": 74 }, { "epoch": 0.09641651936365098, "grad_norm": 0.03418966010212898, "learning_rate": 2e-05, "loss": 0.3757, "step": 75 }, { "epoch": 0.09770207295516632, "grad_norm": 0.033448606729507446, "learning_rate": 2e-05, "loss": 0.4325, "step": 76 }, { "epoch": 0.09898762654668167, "grad_norm": 0.039748664945364, "learning_rate": 2e-05, "loss": 0.507, "step": 77 }, { "epoch": 0.10027318013819701, "grad_norm": 0.04277816414833069, "learning_rate": 2e-05, "loss": 0.6296, "step": 78 }, { "epoch": 0.10155873372971236, "grad_norm": 0.029562752693891525, "learning_rate": 2e-05, "loss": 0.395, "step": 79 }, { "epoch": 0.1028442873212277, "grad_norm": 0.029590601101517677, "learning_rate": 2e-05, "loss": 0.3807, "step": 80 }, { "epoch": 0.10412984091274305, "grad_norm": 0.031173471361398697, "learning_rate": 2e-05, "loss": 0.4175, "step": 81 }, { "epoch": 0.1054153945042584, "grad_norm": 0.03821694105863571, "learning_rate": 2e-05, "loss": 0.5547, "step": 82 }, { "epoch": 0.10670094809577374, "grad_norm": 0.02932704985141754, "learning_rate": 2e-05, "loss": 0.3954, "step": 83 }, { "epoch": 0.10798650168728909, "grad_norm": 0.030441921204328537, "learning_rate": 2e-05, "loss": 0.4564, "step": 84 }, { "epoch": 0.10927205527880443, "grad_norm": 0.03350207954645157, "learning_rate": 2e-05, "loss": 0.4477, "step": 85 }, { "epoch": 0.11055760887031978, "grad_norm": 0.030435308814048767, "learning_rate": 2e-05, "loss": 0.4292, "step": 86 }, { "epoch": 0.11184316246183512, "grad_norm": 0.03452485054731369, "learning_rate": 2e-05, "loss": 0.4572, "step": 87 }, { "epoch": 0.11312871605335048, "grad_norm": 0.029849708080291748, "learning_rate": 2e-05, "loss": 0.3826, "step": 88 }, { "epoch": 0.11441426964486583, "grad_norm": 0.026589911431074142, "learning_rate": 2e-05, "loss": 0.3335, "step": 89 }, { "epoch": 0.11569982323638117, "grad_norm": 0.03767862543463707, "learning_rate": 2e-05, "loss": 0.5377, "step": 90 }, { "epoch": 0.11698537682789652, "grad_norm": 0.030503496527671814, "learning_rate": 2e-05, "loss": 0.4019, "step": 91 }, { "epoch": 0.11827093041941186, "grad_norm": 0.02843611314892769, "learning_rate": 2e-05, "loss": 0.378, "step": 92 }, { "epoch": 0.11955648401092721, "grad_norm": 0.02735988050699234, "learning_rate": 2e-05, "loss": 0.3842, "step": 93 }, { "epoch": 0.12084203760244255, "grad_norm": 0.03628378361463547, "learning_rate": 2e-05, "loss": 0.502, "step": 94 }, { "epoch": 0.1221275911939579, "grad_norm": 0.029980337247252464, "learning_rate": 2e-05, "loss": 0.4177, "step": 95 }, { "epoch": 0.12341314478547324, "grad_norm": 0.03486626222729683, "learning_rate": 2e-05, "loss": 0.495, "step": 96 }, { "epoch": 0.12469869837698859, "grad_norm": 0.03005075454711914, "learning_rate": 2e-05, "loss": 0.3618, "step": 97 }, { "epoch": 0.12598425196850394, "grad_norm": 0.03018985688686371, "learning_rate": 2e-05, "loss": 0.4078, "step": 98 }, { "epoch": 0.1272698055600193, "grad_norm": 0.03108677826821804, "learning_rate": 2e-05, "loss": 0.4583, "step": 99 }, { "epoch": 0.12855535915153463, "grad_norm": 0.029582438990473747, "learning_rate": 2e-05, "loss": 0.4142, "step": 100 }, { "epoch": 0.12984091274304999, "grad_norm": 0.02979620173573494, "learning_rate": 2e-05, "loss": 0.4535, "step": 101 }, { "epoch": 0.13112646633456532, "grad_norm": 0.032250065356492996, "learning_rate": 2e-05, "loss": 0.3805, "step": 102 }, { "epoch": 0.13241201992608068, "grad_norm": 0.03306899964809418, "learning_rate": 2e-05, "loss": 0.4351, "step": 103 }, { "epoch": 0.133697573517596, "grad_norm": 0.023130670189857483, "learning_rate": 2e-05, "loss": 0.2417, "step": 104 }, { "epoch": 0.13498312710911137, "grad_norm": 0.03372225537896156, "learning_rate": 2e-05, "loss": 0.4703, "step": 105 }, { "epoch": 0.1362686807006267, "grad_norm": 0.02907857671380043, "learning_rate": 2e-05, "loss": 0.3437, "step": 106 }, { "epoch": 0.13755423429214206, "grad_norm": 0.03021407686173916, "learning_rate": 2e-05, "loss": 0.4327, "step": 107 }, { "epoch": 0.1388397878836574, "grad_norm": 0.027038615196943283, "learning_rate": 2e-05, "loss": 0.3652, "step": 108 }, { "epoch": 0.14012534147517275, "grad_norm": 0.02982942759990692, "learning_rate": 2e-05, "loss": 0.345, "step": 109 }, { "epoch": 0.14141089506668808, "grad_norm": 0.0561259388923645, "learning_rate": 2e-05, "loss": 0.5073, "step": 110 }, { "epoch": 0.14269644865820344, "grad_norm": 0.024736687541007996, "learning_rate": 2e-05, "loss": 0.3149, "step": 111 }, { "epoch": 0.1439820022497188, "grad_norm": 0.02275976352393627, "learning_rate": 2e-05, "loss": 0.3147, "step": 112 }, { "epoch": 0.14526755584123413, "grad_norm": 0.030464742332696915, "learning_rate": 2e-05, "loss": 0.4512, "step": 113 }, { "epoch": 0.1465531094327495, "grad_norm": 0.026887530460953712, "learning_rate": 2e-05, "loss": 0.3679, "step": 114 }, { "epoch": 0.14783866302426482, "grad_norm": 0.03605503961443901, "learning_rate": 2e-05, "loss": 0.4392, "step": 115 }, { "epoch": 0.14912421661578018, "grad_norm": 0.02638978883624077, "learning_rate": 2e-05, "loss": 0.3484, "step": 116 }, { "epoch": 0.1504097702072955, "grad_norm": 0.03650350496172905, "learning_rate": 2e-05, "loss": 0.3978, "step": 117 }, { "epoch": 0.15169532379881087, "grad_norm": 0.022277837619185448, "learning_rate": 2e-05, "loss": 0.2525, "step": 118 }, { "epoch": 0.1529808773903262, "grad_norm": 0.021412434056401253, "learning_rate": 2e-05, "loss": 0.2922, "step": 119 }, { "epoch": 0.15426643098184156, "grad_norm": 0.029154105111956596, "learning_rate": 2e-05, "loss": 0.3864, "step": 120 }, { "epoch": 0.1555519845733569, "grad_norm": 0.024072440341114998, "learning_rate": 2e-05, "loss": 0.2467, "step": 121 }, { "epoch": 0.15683753816487225, "grad_norm": 0.019447140395641327, "learning_rate": 2e-05, "loss": 0.2086, "step": 122 }, { "epoch": 0.15812309175638758, "grad_norm": 0.035536400973796844, "learning_rate": 2e-05, "loss": 0.489, "step": 123 }, { "epoch": 0.15940864534790294, "grad_norm": 0.026226134970784187, "learning_rate": 2e-05, "loss": 0.3502, "step": 124 }, { "epoch": 0.16069419893941828, "grad_norm": 0.029284900054335594, "learning_rate": 2e-05, "loss": 0.3683, "step": 125 }, { "epoch": 0.16197975253093364, "grad_norm": 0.026484966278076172, "learning_rate": 2e-05, "loss": 0.3686, "step": 126 }, { "epoch": 0.16326530612244897, "grad_norm": 0.03296555206179619, "learning_rate": 2e-05, "loss": 0.4598, "step": 127 }, { "epoch": 0.16455085971396433, "grad_norm": 0.03217398375272751, "learning_rate": 2e-05, "loss": 0.4292, "step": 128 }, { "epoch": 0.16583641330547969, "grad_norm": 0.02639828808605671, "learning_rate": 2e-05, "loss": 0.324, "step": 129 }, { "epoch": 0.16712196689699502, "grad_norm": 0.025398138910531998, "learning_rate": 2e-05, "loss": 0.3565, "step": 130 }, { "epoch": 0.16840752048851038, "grad_norm": 0.026609797030687332, "learning_rate": 2e-05, "loss": 0.326, "step": 131 }, { "epoch": 0.1696930740800257, "grad_norm": 0.029938040301203728, "learning_rate": 2e-05, "loss": 0.4149, "step": 132 }, { "epoch": 0.17097862767154107, "grad_norm": 0.02608969807624817, "learning_rate": 2e-05, "loss": 0.3774, "step": 133 }, { "epoch": 0.1722641812630564, "grad_norm": 0.02580363303422928, "learning_rate": 2e-05, "loss": 0.2944, "step": 134 }, { "epoch": 0.17354973485457176, "grad_norm": 0.029851458966732025, "learning_rate": 2e-05, "loss": 0.3316, "step": 135 }, { "epoch": 0.1748352884460871, "grad_norm": 0.02928406558930874, "learning_rate": 2e-05, "loss": 0.3548, "step": 136 }, { "epoch": 0.17612084203760245, "grad_norm": 0.030875032767653465, "learning_rate": 2e-05, "loss": 0.3617, "step": 137 }, { "epoch": 0.17740639562911778, "grad_norm": 0.026721350848674774, "learning_rate": 2e-05, "loss": 0.3799, "step": 138 }, { "epoch": 0.17869194922063314, "grad_norm": 0.03269115090370178, "learning_rate": 2e-05, "loss": 0.4324, "step": 139 }, { "epoch": 0.17997750281214847, "grad_norm": 0.022154508158564568, "learning_rate": 2e-05, "loss": 0.2744, "step": 140 }, { "epoch": 0.18126305640366383, "grad_norm": 0.022251179441809654, "learning_rate": 2e-05, "loss": 0.2886, "step": 141 }, { "epoch": 0.18254860999517916, "grad_norm": 0.03386593237519264, "learning_rate": 2e-05, "loss": 0.473, "step": 142 }, { "epoch": 0.18383416358669452, "grad_norm": 0.02578306384384632, "learning_rate": 2e-05, "loss": 0.3224, "step": 143 }, { "epoch": 0.18511971717820988, "grad_norm": 0.027509864419698715, "learning_rate": 2e-05, "loss": 0.3224, "step": 144 }, { "epoch": 0.1864052707697252, "grad_norm": 0.02819378860294819, "learning_rate": 2e-05, "loss": 0.3176, "step": 145 }, { "epoch": 0.18769082436124057, "grad_norm": 0.028061147779226303, "learning_rate": 2e-05, "loss": 0.3494, "step": 146 }, { "epoch": 0.1889763779527559, "grad_norm": 0.032399386167526245, "learning_rate": 2e-05, "loss": 0.3647, "step": 147 }, { "epoch": 0.19026193154427126, "grad_norm": 0.028246790170669556, "learning_rate": 2e-05, "loss": 0.3366, "step": 148 }, { "epoch": 0.1915474851357866, "grad_norm": 0.03099609911441803, "learning_rate": 2e-05, "loss": 0.4034, "step": 149 }, { "epoch": 0.19283303872730195, "grad_norm": 0.03750993683934212, "learning_rate": 2e-05, "loss": 0.3395, "step": 150 }, { "epoch": 0.19411859231881728, "grad_norm": 0.0326780304312706, "learning_rate": 2e-05, "loss": 0.4482, "step": 151 }, { "epoch": 0.19540414591033264, "grad_norm": 0.033816393464803696, "learning_rate": 2e-05, "loss": 0.4504, "step": 152 }, { "epoch": 0.19668969950184798, "grad_norm": 0.026754887774586678, "learning_rate": 2e-05, "loss": 0.3334, "step": 153 }, { "epoch": 0.19797525309336333, "grad_norm": 0.02957574650645256, "learning_rate": 2e-05, "loss": 0.3698, "step": 154 }, { "epoch": 0.19926080668487867, "grad_norm": 0.02848845347762108, "learning_rate": 2e-05, "loss": 0.3001, "step": 155 }, { "epoch": 0.20054636027639403, "grad_norm": 0.03636415675282478, "learning_rate": 2e-05, "loss": 0.4872, "step": 156 }, { "epoch": 0.20183191386790936, "grad_norm": 0.018864037469029427, "learning_rate": 2e-05, "loss": 0.2086, "step": 157 }, { "epoch": 0.20311746745942472, "grad_norm": 0.027126725763082504, "learning_rate": 2e-05, "loss": 0.3181, "step": 158 }, { "epoch": 0.20440302105094005, "grad_norm": 0.025296056643128395, "learning_rate": 2e-05, "loss": 0.3169, "step": 159 }, { "epoch": 0.2056885746424554, "grad_norm": 0.035376112908124924, "learning_rate": 2e-05, "loss": 0.4219, "step": 160 }, { "epoch": 0.20697412823397077, "grad_norm": 0.030744420364499092, "learning_rate": 2e-05, "loss": 0.3818, "step": 161 }, { "epoch": 0.2082596818254861, "grad_norm": 0.03273791819810867, "learning_rate": 2e-05, "loss": 0.3823, "step": 162 }, { "epoch": 0.20954523541700146, "grad_norm": 0.030423806980252266, "learning_rate": 2e-05, "loss": 0.3451, "step": 163 }, { "epoch": 0.2108307890085168, "grad_norm": 0.029618561267852783, "learning_rate": 2e-05, "loss": 0.3604, "step": 164 }, { "epoch": 0.21211634260003215, "grad_norm": 0.030883729457855225, "learning_rate": 2e-05, "loss": 0.401, "step": 165 }, { "epoch": 0.21340189619154748, "grad_norm": 0.028922105208039284, "learning_rate": 2e-05, "loss": 0.3305, "step": 166 }, { "epoch": 0.21468744978306284, "grad_norm": 0.033665966242551804, "learning_rate": 2e-05, "loss": 0.3175, "step": 167 }, { "epoch": 0.21597300337457817, "grad_norm": 0.035460278391838074, "learning_rate": 2e-05, "loss": 0.4509, "step": 168 }, { "epoch": 0.21725855696609353, "grad_norm": 0.026533829048275948, "learning_rate": 2e-05, "loss": 0.2403, "step": 169 }, { "epoch": 0.21854411055760886, "grad_norm": 0.029200293123722076, "learning_rate": 2e-05, "loss": 0.2794, "step": 170 }, { "epoch": 0.21982966414912422, "grad_norm": 0.027879290282726288, "learning_rate": 2e-05, "loss": 0.2995, "step": 171 }, { "epoch": 0.22111521774063955, "grad_norm": 0.027549387887120247, "learning_rate": 2e-05, "loss": 0.2803, "step": 172 }, { "epoch": 0.2224007713321549, "grad_norm": 0.03113819658756256, "learning_rate": 2e-05, "loss": 0.2479, "step": 173 }, { "epoch": 0.22368632492367024, "grad_norm": 0.024273231625556946, "learning_rate": 2e-05, "loss": 0.2705, "step": 174 }, { "epoch": 0.2249718785151856, "grad_norm": 0.02970244735479355, "learning_rate": 2e-05, "loss": 0.3266, "step": 175 }, { "epoch": 0.22625743210670096, "grad_norm": 0.028792355209589005, "learning_rate": 2e-05, "loss": 0.3311, "step": 176 }, { "epoch": 0.2275429856982163, "grad_norm": 0.029121607542037964, "learning_rate": 2e-05, "loss": 0.2814, "step": 177 }, { "epoch": 0.22882853928973165, "grad_norm": 0.029099591076374054, "learning_rate": 2e-05, "loss": 0.3065, "step": 178 }, { "epoch": 0.23011409288124698, "grad_norm": 0.02833685837686062, "learning_rate": 2e-05, "loss": 0.3459, "step": 179 }, { "epoch": 0.23139964647276234, "grad_norm": 0.03676662966609001, "learning_rate": 2e-05, "loss": 0.4179, "step": 180 }, { "epoch": 0.23268520006427768, "grad_norm": 0.02846740558743477, "learning_rate": 2e-05, "loss": 0.2879, "step": 181 }, { "epoch": 0.23397075365579303, "grad_norm": 0.030531803146004677, "learning_rate": 2e-05, "loss": 0.362, "step": 182 }, { "epoch": 0.23525630724730837, "grad_norm": 0.034853462129831314, "learning_rate": 2e-05, "loss": 0.3814, "step": 183 }, { "epoch": 0.23654186083882373, "grad_norm": 0.03336189687252045, "learning_rate": 2e-05, "loss": 0.4272, "step": 184 }, { "epoch": 0.23782741443033906, "grad_norm": 0.03514046594500542, "learning_rate": 2e-05, "loss": 0.3432, "step": 185 }, { "epoch": 0.23911296802185442, "grad_norm": 0.032468028366565704, "learning_rate": 2e-05, "loss": 0.405, "step": 186 }, { "epoch": 0.24039852161336975, "grad_norm": 0.026813151314854622, "learning_rate": 2e-05, "loss": 0.2746, "step": 187 }, { "epoch": 0.2416840752048851, "grad_norm": 0.03329463675618172, "learning_rate": 2e-05, "loss": 0.3566, "step": 188 }, { "epoch": 0.24296962879640044, "grad_norm": 0.03253549337387085, "learning_rate": 2e-05, "loss": 0.3951, "step": 189 }, { "epoch": 0.2442551823879158, "grad_norm": 0.03337908163666725, "learning_rate": 2e-05, "loss": 0.4022, "step": 190 }, { "epoch": 0.24554073597943113, "grad_norm": 0.029503121972084045, "learning_rate": 2e-05, "loss": 0.3154, "step": 191 }, { "epoch": 0.2468262895709465, "grad_norm": 0.03800208494067192, "learning_rate": 2e-05, "loss": 0.3716, "step": 192 }, { "epoch": 0.24811184316246185, "grad_norm": 0.04471494257450104, "learning_rate": 2e-05, "loss": 0.3618, "step": 193 }, { "epoch": 0.24939739675397718, "grad_norm": 0.03158828616142273, "learning_rate": 2e-05, "loss": 0.3035, "step": 194 }, { "epoch": 0.25068295034549254, "grad_norm": 0.030343275517225266, "learning_rate": 2e-05, "loss": 0.3177, "step": 195 }, { "epoch": 0.25196850393700787, "grad_norm": 0.027333417907357216, "learning_rate": 2e-05, "loss": 0.2688, "step": 196 }, { "epoch": 0.2532540575285232, "grad_norm": 0.034231096506118774, "learning_rate": 2e-05, "loss": 0.2827, "step": 197 }, { "epoch": 0.2545396111200386, "grad_norm": 0.042767249047756195, "learning_rate": 2e-05, "loss": 0.3617, "step": 198 }, { "epoch": 0.2558251647115539, "grad_norm": 0.04363776370882988, "learning_rate": 2e-05, "loss": 0.428, "step": 199 }, { "epoch": 0.25711071830306925, "grad_norm": 0.03701059892773628, "learning_rate": 2e-05, "loss": 0.4258, "step": 200 }, { "epoch": 0.2583962718945846, "grad_norm": 0.03248538821935654, "learning_rate": 2e-05, "loss": 0.3127, "step": 201 }, { "epoch": 0.25968182548609997, "grad_norm": 0.02792442962527275, "learning_rate": 2e-05, "loss": 0.2616, "step": 202 }, { "epoch": 0.2609673790776153, "grad_norm": 0.02882961928844452, "learning_rate": 2e-05, "loss": 0.2822, "step": 203 }, { "epoch": 0.26225293266913063, "grad_norm": 0.02498476952314377, "learning_rate": 2e-05, "loss": 0.2291, "step": 204 }, { "epoch": 0.26353848626064597, "grad_norm": 0.0262466911226511, "learning_rate": 2e-05, "loss": 0.2084, "step": 205 }, { "epoch": 0.26482403985216135, "grad_norm": 0.031161930412054062, "learning_rate": 2e-05, "loss": 0.2977, "step": 206 }, { "epoch": 0.2661095934436767, "grad_norm": 0.03852604702115059, "learning_rate": 2e-05, "loss": 0.3606, "step": 207 }, { "epoch": 0.267395147035192, "grad_norm": 0.03641024976968765, "learning_rate": 2e-05, "loss": 0.3855, "step": 208 }, { "epoch": 0.2686807006267074, "grad_norm": 0.03774799406528473, "learning_rate": 2e-05, "loss": 0.3458, "step": 209 }, { "epoch": 0.26996625421822273, "grad_norm": 0.04067372530698776, "learning_rate": 2e-05, "loss": 0.4515, "step": 210 }, { "epoch": 0.27125180780973807, "grad_norm": 0.03964482620358467, "learning_rate": 2e-05, "loss": 0.4272, "step": 211 }, { "epoch": 0.2725373614012534, "grad_norm": 0.02894040197134018, "learning_rate": 2e-05, "loss": 0.256, "step": 212 }, { "epoch": 0.2738229149927688, "grad_norm": 0.036077771335840225, "learning_rate": 2e-05, "loss": 0.3755, "step": 213 }, { "epoch": 0.2751084685842841, "grad_norm": 0.032988108694553375, "learning_rate": 2e-05, "loss": 0.3135, "step": 214 }, { "epoch": 0.27639402217579945, "grad_norm": 0.02877802960574627, "learning_rate": 2e-05, "loss": 0.2762, "step": 215 }, { "epoch": 0.2776795757673148, "grad_norm": 0.03700711205601692, "learning_rate": 2e-05, "loss": 0.3022, "step": 216 }, { "epoch": 0.27896512935883017, "grad_norm": 0.03660174459218979, "learning_rate": 2e-05, "loss": 0.3265, "step": 217 }, { "epoch": 0.2802506829503455, "grad_norm": 0.034895338118076324, "learning_rate": 2e-05, "loss": 0.3337, "step": 218 }, { "epoch": 0.28153623654186083, "grad_norm": 0.029524167999625206, "learning_rate": 2e-05, "loss": 0.2872, "step": 219 }, { "epoch": 0.28282179013337616, "grad_norm": 0.037102892994880676, "learning_rate": 2e-05, "loss": 0.3484, "step": 220 }, { "epoch": 0.28410734372489155, "grad_norm": 0.02568918839097023, "learning_rate": 2e-05, "loss": 0.2352, "step": 221 }, { "epoch": 0.2853928973164069, "grad_norm": 0.03680694103240967, "learning_rate": 2e-05, "loss": 0.3156, "step": 222 }, { "epoch": 0.2866784509079222, "grad_norm": 0.03616785258054733, "learning_rate": 2e-05, "loss": 0.3435, "step": 223 }, { "epoch": 0.2879640044994376, "grad_norm": 0.03019794449210167, "learning_rate": 2e-05, "loss": 0.2342, "step": 224 }, { "epoch": 0.28924955809095293, "grad_norm": 0.029189620167016983, "learning_rate": 2e-05, "loss": 0.2622, "step": 225 }, { "epoch": 0.29053511168246826, "grad_norm": 0.03722851350903511, "learning_rate": 2e-05, "loss": 0.3245, "step": 226 }, { "epoch": 0.2918206652739836, "grad_norm": 0.028928019106388092, "learning_rate": 2e-05, "loss": 0.2444, "step": 227 }, { "epoch": 0.293106218865499, "grad_norm": 0.03965122997760773, "learning_rate": 2e-05, "loss": 0.2914, "step": 228 }, { "epoch": 0.2943917724570143, "grad_norm": 0.03618443012237549, "learning_rate": 2e-05, "loss": 0.2944, "step": 229 }, { "epoch": 0.29567732604852964, "grad_norm": 0.04255329445004463, "learning_rate": 2e-05, "loss": 0.3803, "step": 230 }, { "epoch": 0.296962879640045, "grad_norm": 0.03631114959716797, "learning_rate": 2e-05, "loss": 0.3529, "step": 231 }, { "epoch": 0.29824843323156036, "grad_norm": 0.0347764790058136, "learning_rate": 2e-05, "loss": 0.2967, "step": 232 }, { "epoch": 0.2995339868230757, "grad_norm": 0.03510100021958351, "learning_rate": 2e-05, "loss": 0.3316, "step": 233 }, { "epoch": 0.300819540414591, "grad_norm": 0.03378084674477577, "learning_rate": 2e-05, "loss": 0.3318, "step": 234 }, { "epoch": 0.30210509400610636, "grad_norm": 0.035719968378543854, "learning_rate": 2e-05, "loss": 0.2408, "step": 235 }, { "epoch": 0.30339064759762174, "grad_norm": 0.03345809876918793, "learning_rate": 2e-05, "loss": 0.2553, "step": 236 }, { "epoch": 0.3046762011891371, "grad_norm": 0.03555387631058693, "learning_rate": 2e-05, "loss": 0.2032, "step": 237 }, { "epoch": 0.3059617547806524, "grad_norm": 0.037534430623054504, "learning_rate": 2e-05, "loss": 0.3482, "step": 238 }, { "epoch": 0.30724730837216774, "grad_norm": 0.03810921311378479, "learning_rate": 2e-05, "loss": 0.2931, "step": 239 }, { "epoch": 0.3085328619636831, "grad_norm": 0.03767091780900955, "learning_rate": 2e-05, "loss": 0.3031, "step": 240 }, { "epoch": 0.30981841555519846, "grad_norm": 0.04636585712432861, "learning_rate": 2e-05, "loss": 0.3323, "step": 241 }, { "epoch": 0.3111039691467138, "grad_norm": 0.02405642159283161, "learning_rate": 2e-05, "loss": 0.1964, "step": 242 }, { "epoch": 0.3123895227382292, "grad_norm": 0.03820343688130379, "learning_rate": 2e-05, "loss": 0.265, "step": 243 }, { "epoch": 0.3136750763297445, "grad_norm": 0.04235352948307991, "learning_rate": 2e-05, "loss": 0.3761, "step": 244 }, { "epoch": 0.31496062992125984, "grad_norm": 0.02953983098268509, "learning_rate": 2e-05, "loss": 0.2458, "step": 245 }, { "epoch": 0.31624618351277517, "grad_norm": 0.031593743711709976, "learning_rate": 2e-05, "loss": 0.2278, "step": 246 }, { "epoch": 0.31753173710429056, "grad_norm": 0.033025920391082764, "learning_rate": 2e-05, "loss": 0.2967, "step": 247 }, { "epoch": 0.3188172906958059, "grad_norm": 0.03608924522995949, "learning_rate": 2e-05, "loss": 0.3211, "step": 248 }, { "epoch": 0.3201028442873212, "grad_norm": 0.029520737007260323, "learning_rate": 2e-05, "loss": 0.2659, "step": 249 }, { "epoch": 0.32138839787883655, "grad_norm": 0.043838564306497574, "learning_rate": 2e-05, "loss": 0.4001, "step": 250 }, { "epoch": 0.32267395147035194, "grad_norm": 0.03314085677266121, "learning_rate": 2e-05, "loss": 0.2669, "step": 251 }, { "epoch": 0.32395950506186727, "grad_norm": 0.03647439181804657, "learning_rate": 2e-05, "loss": 0.3063, "step": 252 }, { "epoch": 0.3252450586533826, "grad_norm": 0.03778000921010971, "learning_rate": 2e-05, "loss": 0.3437, "step": 253 }, { "epoch": 0.32653061224489793, "grad_norm": 0.035549599677324295, "learning_rate": 2e-05, "loss": 0.3332, "step": 254 }, { "epoch": 0.3278161658364133, "grad_norm": 0.033758629113435745, "learning_rate": 2e-05, "loss": 0.2372, "step": 255 }, { "epoch": 0.32910171942792865, "grad_norm": 0.04042687267065048, "learning_rate": 2e-05, "loss": 0.2862, "step": 256 }, { "epoch": 0.330387273019444, "grad_norm": 0.032794684171676636, "learning_rate": 2e-05, "loss": 0.2631, "step": 257 }, { "epoch": 0.33167282661095937, "grad_norm": 0.03374920412898064, "learning_rate": 2e-05, "loss": 0.2703, "step": 258 }, { "epoch": 0.3329583802024747, "grad_norm": 0.03981158137321472, "learning_rate": 2e-05, "loss": 0.3021, "step": 259 }, { "epoch": 0.33424393379399003, "grad_norm": 0.034164056181907654, "learning_rate": 2e-05, "loss": 0.3245, "step": 260 }, { "epoch": 0.33552948738550537, "grad_norm": 0.03673673793673515, "learning_rate": 2e-05, "loss": 0.3044, "step": 261 }, { "epoch": 0.33681504097702075, "grad_norm": 0.04251427203416824, "learning_rate": 2e-05, "loss": 0.3132, "step": 262 }, { "epoch": 0.3381005945685361, "grad_norm": 0.055292125791311264, "learning_rate": 2e-05, "loss": 0.3668, "step": 263 }, { "epoch": 0.3393861481600514, "grad_norm": 0.03982202708721161, "learning_rate": 2e-05, "loss": 0.3389, "step": 264 }, { "epoch": 0.34067170175156675, "grad_norm": 0.03548764809966087, "learning_rate": 2e-05, "loss": 0.2439, "step": 265 }, { "epoch": 0.34195725534308213, "grad_norm": 0.04806696996092796, "learning_rate": 2e-05, "loss": 0.3923, "step": 266 }, { "epoch": 0.34324280893459747, "grad_norm": 0.036050595343112946, "learning_rate": 2e-05, "loss": 0.2605, "step": 267 }, { "epoch": 0.3445283625261128, "grad_norm": 0.032735515385866165, "learning_rate": 2e-05, "loss": 0.2451, "step": 268 }, { "epoch": 0.34581391611762813, "grad_norm": 0.039695464074611664, "learning_rate": 2e-05, "loss": 0.3072, "step": 269 }, { "epoch": 0.3470994697091435, "grad_norm": 0.027333933860063553, "learning_rate": 2e-05, "loss": 0.2099, "step": 270 }, { "epoch": 0.34838502330065885, "grad_norm": 0.03149592876434326, "learning_rate": 2e-05, "loss": 0.2613, "step": 271 }, { "epoch": 0.3496705768921742, "grad_norm": 0.031215226277709007, "learning_rate": 2e-05, "loss": 0.2833, "step": 272 }, { "epoch": 0.35095613048368957, "grad_norm": 0.04059711471199989, "learning_rate": 2e-05, "loss": 0.3666, "step": 273 }, { "epoch": 0.3522416840752049, "grad_norm": 0.04247285798192024, "learning_rate": 2e-05, "loss": 0.3758, "step": 274 }, { "epoch": 0.35352723766672023, "grad_norm": 0.034378454089164734, "learning_rate": 2e-05, "loss": 0.2519, "step": 275 }, { "epoch": 0.35481279125823556, "grad_norm": 0.037096619606018066, "learning_rate": 2e-05, "loss": 0.3256, "step": 276 }, { "epoch": 0.35609834484975095, "grad_norm": 0.03536511957645416, "learning_rate": 2e-05, "loss": 0.2596, "step": 277 }, { "epoch": 0.3573838984412663, "grad_norm": 0.046086303889751434, "learning_rate": 2e-05, "loss": 0.3132, "step": 278 }, { "epoch": 0.3586694520327816, "grad_norm": 0.03302552178502083, "learning_rate": 2e-05, "loss": 0.2839, "step": 279 }, { "epoch": 0.35995500562429694, "grad_norm": 0.03423115238547325, "learning_rate": 2e-05, "loss": 0.2678, "step": 280 }, { "epoch": 0.36124055921581233, "grad_norm": 0.03363805264234543, "learning_rate": 2e-05, "loss": 0.2243, "step": 281 }, { "epoch": 0.36252611280732766, "grad_norm": 0.03901209309697151, "learning_rate": 2e-05, "loss": 0.2956, "step": 282 }, { "epoch": 0.363811666398843, "grad_norm": 0.03081115335226059, "learning_rate": 2e-05, "loss": 0.213, "step": 283 }, { "epoch": 0.3650972199903583, "grad_norm": 0.04130322486162186, "learning_rate": 2e-05, "loss": 0.3161, "step": 284 }, { "epoch": 0.3663827735818737, "grad_norm": 0.03694218024611473, "learning_rate": 2e-05, "loss": 0.3228, "step": 285 }, { "epoch": 0.36766832717338904, "grad_norm": 0.048961639404296875, "learning_rate": 2e-05, "loss": 0.3688, "step": 286 }, { "epoch": 0.3689538807649044, "grad_norm": 0.03482965752482414, "learning_rate": 2e-05, "loss": 0.2797, "step": 287 }, { "epoch": 0.37023943435641976, "grad_norm": 0.043517641723155975, "learning_rate": 2e-05, "loss": 0.3395, "step": 288 }, { "epoch": 0.3715249879479351, "grad_norm": 0.03916122019290924, "learning_rate": 2e-05, "loss": 0.3168, "step": 289 }, { "epoch": 0.3728105415394504, "grad_norm": 0.03970535099506378, "learning_rate": 2e-05, "loss": 0.3523, "step": 290 }, { "epoch": 0.37409609513096576, "grad_norm": 0.043576546013355255, "learning_rate": 2e-05, "loss": 0.3974, "step": 291 }, { "epoch": 0.37538164872248114, "grad_norm": 0.03478504344820976, "learning_rate": 2e-05, "loss": 0.2663, "step": 292 }, { "epoch": 0.3766672023139965, "grad_norm": 0.0442640446126461, "learning_rate": 2e-05, "loss": 0.2685, "step": 293 }, { "epoch": 0.3779527559055118, "grad_norm": 0.04135148599743843, "learning_rate": 2e-05, "loss": 0.3765, "step": 294 }, { "epoch": 0.37923830949702714, "grad_norm": 0.03744332864880562, "learning_rate": 2e-05, "loss": 0.3693, "step": 295 }, { "epoch": 0.3805238630885425, "grad_norm": 0.038954440504312515, "learning_rate": 2e-05, "loss": 0.289, "step": 296 }, { "epoch": 0.38180941668005786, "grad_norm": 0.031730618327856064, "learning_rate": 2e-05, "loss": 0.2271, "step": 297 }, { "epoch": 0.3830949702715732, "grad_norm": 0.04433518648147583, "learning_rate": 2e-05, "loss": 0.4079, "step": 298 }, { "epoch": 0.3843805238630885, "grad_norm": 0.04384070262312889, "learning_rate": 2e-05, "loss": 0.3005, "step": 299 }, { "epoch": 0.3856660774546039, "grad_norm": 0.03004288114607334, "learning_rate": 2e-05, "loss": 0.2113, "step": 300 }, { "epoch": 0.38695163104611924, "grad_norm": 0.0353570394217968, "learning_rate": 2e-05, "loss": 0.2198, "step": 301 }, { "epoch": 0.38823718463763457, "grad_norm": 0.04267432913184166, "learning_rate": 2e-05, "loss": 0.3431, "step": 302 }, { "epoch": 0.3895227382291499, "grad_norm": 0.04084617272019386, "learning_rate": 2e-05, "loss": 0.3099, "step": 303 }, { "epoch": 0.3908082918206653, "grad_norm": 0.059954188764095306, "learning_rate": 2e-05, "loss": 0.3891, "step": 304 }, { "epoch": 0.3920938454121806, "grad_norm": 0.03467090055346489, "learning_rate": 2e-05, "loss": 0.2383, "step": 305 }, { "epoch": 0.39337939900369595, "grad_norm": 0.03164566680788994, "learning_rate": 2e-05, "loss": 0.2079, "step": 306 }, { "epoch": 0.39466495259521134, "grad_norm": 0.048123132437467575, "learning_rate": 2e-05, "loss": 0.3726, "step": 307 }, { "epoch": 0.39595050618672667, "grad_norm": 0.03534458950161934, "learning_rate": 2e-05, "loss": 0.2651, "step": 308 }, { "epoch": 0.397236059778242, "grad_norm": 0.03836483508348465, "learning_rate": 2e-05, "loss": 0.3101, "step": 309 }, { "epoch": 0.39852161336975733, "grad_norm": 0.047910891473293304, "learning_rate": 2e-05, "loss": 0.3234, "step": 310 }, { "epoch": 0.3998071669612727, "grad_norm": 0.027741173282265663, "learning_rate": 2e-05, "loss": 0.1632, "step": 311 }, { "epoch": 0.40109272055278805, "grad_norm": 0.0344574935734272, "learning_rate": 2e-05, "loss": 0.2463, "step": 312 }, { "epoch": 0.4023782741443034, "grad_norm": 0.032118018716573715, "learning_rate": 2e-05, "loss": 0.2298, "step": 313 }, { "epoch": 0.4036638277358187, "grad_norm": 0.040490612387657166, "learning_rate": 2e-05, "loss": 0.3247, "step": 314 }, { "epoch": 0.4049493813273341, "grad_norm": 0.03369493409991264, "learning_rate": 2e-05, "loss": 0.2088, "step": 315 }, { "epoch": 0.40623493491884943, "grad_norm": 0.04419386386871338, "learning_rate": 2e-05, "loss": 0.3354, "step": 316 }, { "epoch": 0.40752048851036476, "grad_norm": 0.04048989340662956, "learning_rate": 2e-05, "loss": 0.2988, "step": 317 }, { "epoch": 0.4088060421018801, "grad_norm": 0.040915414690971375, "learning_rate": 2e-05, "loss": 0.2315, "step": 318 }, { "epoch": 0.4100915956933955, "grad_norm": 0.03020886704325676, "learning_rate": 2e-05, "loss": 0.2137, "step": 319 }, { "epoch": 0.4113771492849108, "grad_norm": 0.0413849912583828, "learning_rate": 2e-05, "loss": 0.3479, "step": 320 }, { "epoch": 0.41266270287642615, "grad_norm": 0.04639044404029846, "learning_rate": 2e-05, "loss": 0.3689, "step": 321 }, { "epoch": 0.41394825646794153, "grad_norm": 0.044351786375045776, "learning_rate": 2e-05, "loss": 0.3488, "step": 322 }, { "epoch": 0.41523381005945686, "grad_norm": 0.030558589845895767, "learning_rate": 2e-05, "loss": 0.2211, "step": 323 }, { "epoch": 0.4165193636509722, "grad_norm": 0.03329205513000488, "learning_rate": 2e-05, "loss": 0.2282, "step": 324 }, { "epoch": 0.41780491724248753, "grad_norm": 0.04240158200263977, "learning_rate": 2e-05, "loss": 0.2571, "step": 325 }, { "epoch": 0.4190904708340029, "grad_norm": 0.040866266936063766, "learning_rate": 2e-05, "loss": 0.289, "step": 326 }, { "epoch": 0.42037602442551825, "grad_norm": 0.04475086182355881, "learning_rate": 2e-05, "loss": 0.2889, "step": 327 }, { "epoch": 0.4216615780170336, "grad_norm": 0.03587472438812256, "learning_rate": 2e-05, "loss": 0.2452, "step": 328 }, { "epoch": 0.4229471316085489, "grad_norm": 0.04346352815628052, "learning_rate": 2e-05, "loss": 0.3751, "step": 329 }, { "epoch": 0.4242326852000643, "grad_norm": 0.03417763113975525, "learning_rate": 2e-05, "loss": 0.2825, "step": 330 }, { "epoch": 0.42551823879157963, "grad_norm": 0.030223989859223366, "learning_rate": 2e-05, "loss": 0.2339, "step": 331 }, { "epoch": 0.42680379238309496, "grad_norm": 0.0342961922287941, "learning_rate": 2e-05, "loss": 0.2616, "step": 332 }, { "epoch": 0.4280893459746103, "grad_norm": 0.04207473620772362, "learning_rate": 2e-05, "loss": 0.265, "step": 333 }, { "epoch": 0.4293748995661257, "grad_norm": 0.03148888424038887, "learning_rate": 2e-05, "loss": 0.1792, "step": 334 }, { "epoch": 0.430660453157641, "grad_norm": 0.039937492460012436, "learning_rate": 2e-05, "loss": 0.2502, "step": 335 }, { "epoch": 0.43194600674915634, "grad_norm": 0.03943054750561714, "learning_rate": 2e-05, "loss": 0.2733, "step": 336 }, { "epoch": 0.43323156034067173, "grad_norm": 0.03569771721959114, "learning_rate": 2e-05, "loss": 0.2099, "step": 337 }, { "epoch": 0.43451711393218706, "grad_norm": 0.036599624902009964, "learning_rate": 2e-05, "loss": 0.2478, "step": 338 }, { "epoch": 0.4358026675237024, "grad_norm": 0.054707758128643036, "learning_rate": 2e-05, "loss": 0.4257, "step": 339 }, { "epoch": 0.4370882211152177, "grad_norm": 0.0450870580971241, "learning_rate": 2e-05, "loss": 0.3091, "step": 340 }, { "epoch": 0.4383737747067331, "grad_norm": 0.03818565234541893, "learning_rate": 2e-05, "loss": 0.2781, "step": 341 }, { "epoch": 0.43965932829824844, "grad_norm": 0.03722561523318291, "learning_rate": 2e-05, "loss": 0.2602, "step": 342 }, { "epoch": 0.4409448818897638, "grad_norm": 0.038348764181137085, "learning_rate": 2e-05, "loss": 0.286, "step": 343 }, { "epoch": 0.4422304354812791, "grad_norm": 0.02572775073349476, "learning_rate": 2e-05, "loss": 0.1886, "step": 344 }, { "epoch": 0.4435159890727945, "grad_norm": 0.03972122073173523, "learning_rate": 2e-05, "loss": 0.2756, "step": 345 }, { "epoch": 0.4448015426643098, "grad_norm": 0.03696167469024658, "learning_rate": 2e-05, "loss": 0.2526, "step": 346 }, { "epoch": 0.44608709625582516, "grad_norm": 0.03587668761610985, "learning_rate": 2e-05, "loss": 0.2044, "step": 347 }, { "epoch": 0.4473726498473405, "grad_norm": 0.03959975019097328, "learning_rate": 2e-05, "loss": 0.3007, "step": 348 }, { "epoch": 0.4486582034388559, "grad_norm": 0.03879138454794884, "learning_rate": 2e-05, "loss": 0.2382, "step": 349 }, { "epoch": 0.4499437570303712, "grad_norm": 0.05302846059203148, "learning_rate": 2e-05, "loss": 0.3375, "step": 350 }, { "epoch": 0.45122931062188654, "grad_norm": 0.039411693811416626, "learning_rate": 2e-05, "loss": 0.2662, "step": 351 }, { "epoch": 0.4525148642134019, "grad_norm": 0.03571093827486038, "learning_rate": 2e-05, "loss": 0.2054, "step": 352 }, { "epoch": 0.45380041780491726, "grad_norm": 0.0486789233982563, "learning_rate": 2e-05, "loss": 0.3314, "step": 353 }, { "epoch": 0.4550859713964326, "grad_norm": 0.037670183926820755, "learning_rate": 2e-05, "loss": 0.2484, "step": 354 }, { "epoch": 0.4563715249879479, "grad_norm": 0.056887123733758926, "learning_rate": 2e-05, "loss": 0.3562, "step": 355 }, { "epoch": 0.4576570785794633, "grad_norm": 0.04562405124306679, "learning_rate": 2e-05, "loss": 0.2869, "step": 356 }, { "epoch": 0.45894263217097864, "grad_norm": 0.040491264313459396, "learning_rate": 2e-05, "loss": 0.3541, "step": 357 }, { "epoch": 0.46022818576249397, "grad_norm": 0.04283326864242554, "learning_rate": 2e-05, "loss": 0.2674, "step": 358 }, { "epoch": 0.4615137393540093, "grad_norm": 0.05063975229859352, "learning_rate": 2e-05, "loss": 0.4013, "step": 359 }, { "epoch": 0.4627992929455247, "grad_norm": 0.037555571645498276, "learning_rate": 2e-05, "loss": 0.2419, "step": 360 }, { "epoch": 0.46408484653704, "grad_norm": 0.036944594234228134, "learning_rate": 2e-05, "loss": 0.2426, "step": 361 }, { "epoch": 0.46537040012855535, "grad_norm": 0.05010130628943443, "learning_rate": 2e-05, "loss": 0.2996, "step": 362 }, { "epoch": 0.4666559537200707, "grad_norm": 0.0335206501185894, "learning_rate": 2e-05, "loss": 0.2451, "step": 363 }, { "epoch": 0.46794150731158607, "grad_norm": 0.052481383085250854, "learning_rate": 2e-05, "loss": 0.3812, "step": 364 }, { "epoch": 0.4692270609031014, "grad_norm": 0.04185234755277634, "learning_rate": 2e-05, "loss": 0.274, "step": 365 }, { "epoch": 0.47051261449461673, "grad_norm": 0.03707558289170265, "learning_rate": 2e-05, "loss": 0.2505, "step": 366 }, { "epoch": 0.47179816808613206, "grad_norm": 0.060728251934051514, "learning_rate": 2e-05, "loss": 0.3279, "step": 367 }, { "epoch": 0.47308372167764745, "grad_norm": 0.031999371945858, "learning_rate": 2e-05, "loss": 0.1866, "step": 368 }, { "epoch": 0.4743692752691628, "grad_norm": 0.044399287551641464, "learning_rate": 2e-05, "loss": 0.249, "step": 369 }, { "epoch": 0.4756548288606781, "grad_norm": 0.05057983100414276, "learning_rate": 2e-05, "loss": 0.3612, "step": 370 }, { "epoch": 0.4769403824521935, "grad_norm": 0.039979059249162674, "learning_rate": 2e-05, "loss": 0.2684, "step": 371 }, { "epoch": 0.47822593604370883, "grad_norm": 0.03305087611079216, "learning_rate": 2e-05, "loss": 0.2164, "step": 372 }, { "epoch": 0.47951148963522416, "grad_norm": 0.045574892312288284, "learning_rate": 2e-05, "loss": 0.3127, "step": 373 }, { "epoch": 0.4807970432267395, "grad_norm": 0.05269627645611763, "learning_rate": 2e-05, "loss": 0.3315, "step": 374 }, { "epoch": 0.4820825968182549, "grad_norm": 0.06162478029727936, "learning_rate": 2e-05, "loss": 0.3347, "step": 375 }, { "epoch": 0.4833681504097702, "grad_norm": 0.04428340122103691, "learning_rate": 2e-05, "loss": 0.2794, "step": 376 }, { "epoch": 0.48465370400128555, "grad_norm": 0.04249970242381096, "learning_rate": 2e-05, "loss": 0.2781, "step": 377 }, { "epoch": 0.4859392575928009, "grad_norm": 0.04270468279719353, "learning_rate": 2e-05, "loss": 0.2878, "step": 378 }, { "epoch": 0.48722481118431626, "grad_norm": 0.036853183060884476, "learning_rate": 2e-05, "loss": 0.2548, "step": 379 }, { "epoch": 0.4885103647758316, "grad_norm": 0.03981437534093857, "learning_rate": 2e-05, "loss": 0.2743, "step": 380 }, { "epoch": 0.4897959183673469, "grad_norm": 0.04621482267975807, "learning_rate": 2e-05, "loss": 0.3524, "step": 381 }, { "epoch": 0.49108147195886226, "grad_norm": 0.04479382932186127, "learning_rate": 2e-05, "loss": 0.3013, "step": 382 }, { "epoch": 0.49236702555037765, "grad_norm": 0.0524832084774971, "learning_rate": 2e-05, "loss": 0.4674, "step": 383 }, { "epoch": 0.493652579141893, "grad_norm": 0.05657699331641197, "learning_rate": 2e-05, "loss": 0.3734, "step": 384 }, { "epoch": 0.4949381327334083, "grad_norm": 0.05035189166665077, "learning_rate": 2e-05, "loss": 0.2522, "step": 385 }, { "epoch": 0.4962236863249237, "grad_norm": 0.045344091951847076, "learning_rate": 2e-05, "loss": 0.3121, "step": 386 }, { "epoch": 0.49750923991643903, "grad_norm": 0.038680486381053925, "learning_rate": 2e-05, "loss": 0.2999, "step": 387 }, { "epoch": 0.49879479350795436, "grad_norm": 0.03980954363942146, "learning_rate": 2e-05, "loss": 0.2476, "step": 388 }, { "epoch": 0.5000803470994697, "grad_norm": 0.04812563210725784, "learning_rate": 2e-05, "loss": 0.3218, "step": 389 }, { "epoch": 0.5013659006909851, "grad_norm": 0.04132760316133499, "learning_rate": 2e-05, "loss": 0.2344, "step": 390 }, { "epoch": 0.5026514542825004, "grad_norm": 0.03867589682340622, "learning_rate": 2e-05, "loss": 0.2172, "step": 391 }, { "epoch": 0.5039370078740157, "grad_norm": 0.05404170975089073, "learning_rate": 2e-05, "loss": 0.3489, "step": 392 }, { "epoch": 0.5052225614655311, "grad_norm": 0.05424851179122925, "learning_rate": 2e-05, "loss": 0.3908, "step": 393 }, { "epoch": 0.5065081150570464, "grad_norm": 0.046993743628263474, "learning_rate": 2e-05, "loss": 0.3133, "step": 394 }, { "epoch": 0.5077936686485618, "grad_norm": 0.038952894508838654, "learning_rate": 2e-05, "loss": 0.273, "step": 395 }, { "epoch": 0.5090792222400772, "grad_norm": 0.039642345160245895, "learning_rate": 2e-05, "loss": 0.2163, "step": 396 }, { "epoch": 0.5103647758315925, "grad_norm": 0.05045924335718155, "learning_rate": 2e-05, "loss": 0.3934, "step": 397 }, { "epoch": 0.5116503294231078, "grad_norm": 0.03384791314601898, "learning_rate": 2e-05, "loss": 0.2427, "step": 398 }, { "epoch": 0.5129358830146231, "grad_norm": 0.04521351680159569, "learning_rate": 2e-05, "loss": 0.3329, "step": 399 }, { "epoch": 0.5142214366061385, "grad_norm": 0.044563427567481995, "learning_rate": 2e-05, "loss": 0.327, "step": 400 }, { "epoch": 0.5155069901976539, "grad_norm": 0.027659917250275612, "learning_rate": 2e-05, "loss": 0.1984, "step": 401 }, { "epoch": 0.5167925437891692, "grad_norm": 0.047275714576244354, "learning_rate": 2e-05, "loss": 0.2867, "step": 402 }, { "epoch": 0.5180780973806846, "grad_norm": 0.04775230586528778, "learning_rate": 2e-05, "loss": 0.355, "step": 403 }, { "epoch": 0.5193636509721999, "grad_norm": 0.04720161855220795, "learning_rate": 2e-05, "loss": 0.2423, "step": 404 }, { "epoch": 0.5206492045637152, "grad_norm": 0.04180417209863663, "learning_rate": 2e-05, "loss": 0.2688, "step": 405 }, { "epoch": 0.5219347581552306, "grad_norm": 0.05189646780490875, "learning_rate": 2e-05, "loss": 0.3736, "step": 406 }, { "epoch": 0.523220311746746, "grad_norm": 0.04067251831293106, "learning_rate": 2e-05, "loss": 0.3039, "step": 407 }, { "epoch": 0.5245058653382613, "grad_norm": 0.05931917205452919, "learning_rate": 2e-05, "loss": 0.3374, "step": 408 }, { "epoch": 0.5257914189297767, "grad_norm": 0.04547608271241188, "learning_rate": 2e-05, "loss": 0.2968, "step": 409 }, { "epoch": 0.5270769725212919, "grad_norm": 0.04650389403104782, "learning_rate": 2e-05, "loss": 0.2854, "step": 410 }, { "epoch": 0.5283625261128073, "grad_norm": 0.05240015685558319, "learning_rate": 2e-05, "loss": 0.2837, "step": 411 }, { "epoch": 0.5296480797043227, "grad_norm": 0.05040004476904869, "learning_rate": 2e-05, "loss": 0.2923, "step": 412 }, { "epoch": 0.530933633295838, "grad_norm": 0.04871930554509163, "learning_rate": 2e-05, "loss": 0.2414, "step": 413 }, { "epoch": 0.5322191868873534, "grad_norm": 0.04192574322223663, "learning_rate": 2e-05, "loss": 0.2764, "step": 414 }, { "epoch": 0.5335047404788688, "grad_norm": 0.05296563729643822, "learning_rate": 2e-05, "loss": 0.2723, "step": 415 }, { "epoch": 0.534790294070384, "grad_norm": 0.03959592431783676, "learning_rate": 2e-05, "loss": 0.2204, "step": 416 }, { "epoch": 0.5360758476618994, "grad_norm": 0.03962741047143936, "learning_rate": 2e-05, "loss": 0.2518, "step": 417 }, { "epoch": 0.5373614012534148, "grad_norm": 0.040081944316625595, "learning_rate": 2e-05, "loss": 0.2573, "step": 418 }, { "epoch": 0.5386469548449301, "grad_norm": 0.04713954031467438, "learning_rate": 2e-05, "loss": 0.2925, "step": 419 }, { "epoch": 0.5399325084364455, "grad_norm": 0.05657007545232773, "learning_rate": 2e-05, "loss": 0.4272, "step": 420 }, { "epoch": 0.5412180620279607, "grad_norm": 0.05307560786604881, "learning_rate": 2e-05, "loss": 0.316, "step": 421 }, { "epoch": 0.5425036156194761, "grad_norm": 0.04280155152082443, "learning_rate": 2e-05, "loss": 0.2614, "step": 422 }, { "epoch": 0.5437891692109915, "grad_norm": 0.03501439467072487, "learning_rate": 2e-05, "loss": 0.2318, "step": 423 }, { "epoch": 0.5450747228025068, "grad_norm": 0.05088590830564499, "learning_rate": 2e-05, "loss": 0.3533, "step": 424 }, { "epoch": 0.5463602763940222, "grad_norm": 0.03503134846687317, "learning_rate": 2e-05, "loss": 0.2079, "step": 425 }, { "epoch": 0.5476458299855376, "grad_norm": 0.043812718242406845, "learning_rate": 2e-05, "loss": 0.3205, "step": 426 }, { "epoch": 0.5489313835770528, "grad_norm": 0.05358745902776718, "learning_rate": 2e-05, "loss": 0.3713, "step": 427 }, { "epoch": 0.5502169371685682, "grad_norm": 0.042078517377376556, "learning_rate": 2e-05, "loss": 0.2371, "step": 428 }, { "epoch": 0.5515024907600835, "grad_norm": 0.04489399120211601, "learning_rate": 2e-05, "loss": 0.2832, "step": 429 }, { "epoch": 0.5527880443515989, "grad_norm": 0.04766567423939705, "learning_rate": 2e-05, "loss": 0.2151, "step": 430 }, { "epoch": 0.5540735979431143, "grad_norm": 0.04447382688522339, "learning_rate": 2e-05, "loss": 0.2317, "step": 431 }, { "epoch": 0.5553591515346296, "grad_norm": 0.04144001007080078, "learning_rate": 2e-05, "loss": 0.2667, "step": 432 }, { "epoch": 0.556644705126145, "grad_norm": 0.04112810641527176, "learning_rate": 2e-05, "loss": 0.2758, "step": 433 }, { "epoch": 0.5579302587176603, "grad_norm": 0.032402511686086655, "learning_rate": 2e-05, "loss": 0.2047, "step": 434 }, { "epoch": 0.5592158123091756, "grad_norm": 0.04352883994579315, "learning_rate": 2e-05, "loss": 0.323, "step": 435 }, { "epoch": 0.560501365900691, "grad_norm": 0.0496109239757061, "learning_rate": 2e-05, "loss": 0.2962, "step": 436 }, { "epoch": 0.5617869194922064, "grad_norm": 0.04593720659613609, "learning_rate": 2e-05, "loss": 0.3705, "step": 437 }, { "epoch": 0.5630724730837217, "grad_norm": 0.040998801589012146, "learning_rate": 2e-05, "loss": 0.2219, "step": 438 }, { "epoch": 0.564358026675237, "grad_norm": 0.04891293868422508, "learning_rate": 2e-05, "loss": 0.2539, "step": 439 }, { "epoch": 0.5656435802667523, "grad_norm": 0.04628092423081398, "learning_rate": 2e-05, "loss": 0.2521, "step": 440 }, { "epoch": 0.5669291338582677, "grad_norm": 0.03929414600133896, "learning_rate": 2e-05, "loss": 0.1931, "step": 441 }, { "epoch": 0.5682146874497831, "grad_norm": 0.03937762975692749, "learning_rate": 2e-05, "loss": 0.2225, "step": 442 }, { "epoch": 0.5695002410412984, "grad_norm": 0.057498469948768616, "learning_rate": 2e-05, "loss": 0.4021, "step": 443 }, { "epoch": 0.5707857946328138, "grad_norm": 0.04665215313434601, "learning_rate": 2e-05, "loss": 0.3026, "step": 444 }, { "epoch": 0.5720713482243291, "grad_norm": 0.04521113634109497, "learning_rate": 2e-05, "loss": 0.2592, "step": 445 }, { "epoch": 0.5733569018158444, "grad_norm": 0.038349051028490067, "learning_rate": 2e-05, "loss": 0.2501, "step": 446 }, { "epoch": 0.5746424554073598, "grad_norm": 0.04515808820724487, "learning_rate": 2e-05, "loss": 0.3092, "step": 447 }, { "epoch": 0.5759280089988752, "grad_norm": 0.047012921422719955, "learning_rate": 2e-05, "loss": 0.3338, "step": 448 }, { "epoch": 0.5772135625903905, "grad_norm": 0.0472906231880188, "learning_rate": 2e-05, "loss": 0.3139, "step": 449 }, { "epoch": 0.5784991161819059, "grad_norm": 0.04748733341693878, "learning_rate": 2e-05, "loss": 0.2414, "step": 450 }, { "epoch": 0.5797846697734211, "grad_norm": 0.03514058515429497, "learning_rate": 2e-05, "loss": 0.1946, "step": 451 }, { "epoch": 0.5810702233649365, "grad_norm": 0.050174906849861145, "learning_rate": 2e-05, "loss": 0.3284, "step": 452 }, { "epoch": 0.5823557769564519, "grad_norm": 0.05283737555146217, "learning_rate": 2e-05, "loss": 0.3073, "step": 453 }, { "epoch": 0.5836413305479672, "grad_norm": 0.04498602822422981, "learning_rate": 2e-05, "loss": 0.2604, "step": 454 }, { "epoch": 0.5849268841394826, "grad_norm": 0.042758163064718246, "learning_rate": 2e-05, "loss": 0.2221, "step": 455 }, { "epoch": 0.586212437730998, "grad_norm": 0.041656941175460815, "learning_rate": 2e-05, "loss": 0.2491, "step": 456 }, { "epoch": 0.5874979913225132, "grad_norm": 0.03713398054242134, "learning_rate": 2e-05, "loss": 0.1754, "step": 457 }, { "epoch": 0.5887835449140286, "grad_norm": 0.0447508729994297, "learning_rate": 2e-05, "loss": 0.2792, "step": 458 }, { "epoch": 0.5900690985055439, "grad_norm": 0.04686212167143822, "learning_rate": 2e-05, "loss": 0.2609, "step": 459 }, { "epoch": 0.5913546520970593, "grad_norm": 0.040732961148023605, "learning_rate": 2e-05, "loss": 0.2089, "step": 460 }, { "epoch": 0.5926402056885747, "grad_norm": 0.04114542156457901, "learning_rate": 2e-05, "loss": 0.2315, "step": 461 }, { "epoch": 0.59392575928009, "grad_norm": 0.040324702858924866, "learning_rate": 2e-05, "loss": 0.2778, "step": 462 }, { "epoch": 0.5952113128716053, "grad_norm": 0.0678023248910904, "learning_rate": 2e-05, "loss": 0.3029, "step": 463 }, { "epoch": 0.5964968664631207, "grad_norm": 0.04701264947652817, "learning_rate": 2e-05, "loss": 0.2829, "step": 464 }, { "epoch": 0.597782420054636, "grad_norm": 0.03481682017445564, "learning_rate": 2e-05, "loss": 0.2345, "step": 465 }, { "epoch": 0.5990679736461514, "grad_norm": 0.0509064756333828, "learning_rate": 2e-05, "loss": 0.303, "step": 466 }, { "epoch": 0.6003535272376668, "grad_norm": 0.052839163690805435, "learning_rate": 2e-05, "loss": 0.2798, "step": 467 }, { "epoch": 0.601639080829182, "grad_norm": 0.03605001047253609, "learning_rate": 2e-05, "loss": 0.1783, "step": 468 }, { "epoch": 0.6029246344206974, "grad_norm": 0.03640325739979744, "learning_rate": 2e-05, "loss": 0.2498, "step": 469 }, { "epoch": 0.6042101880122127, "grad_norm": 0.03874512016773224, "learning_rate": 2e-05, "loss": 0.1996, "step": 470 }, { "epoch": 0.6054957416037281, "grad_norm": 0.03477559611201286, "learning_rate": 2e-05, "loss": 0.2121, "step": 471 }, { "epoch": 0.6067812951952435, "grad_norm": 0.04953417927026749, "learning_rate": 2e-05, "loss": 0.2821, "step": 472 }, { "epoch": 0.6080668487867588, "grad_norm": 0.04992024600505829, "learning_rate": 2e-05, "loss": 0.362, "step": 473 }, { "epoch": 0.6093524023782741, "grad_norm": 0.048429060727357864, "learning_rate": 2e-05, "loss": 0.2217, "step": 474 }, { "epoch": 0.6106379559697895, "grad_norm": 0.05344587191939354, "learning_rate": 2e-05, "loss": 0.2989, "step": 475 }, { "epoch": 0.6119235095613048, "grad_norm": 0.04274825379252434, "learning_rate": 2e-05, "loss": 0.2424, "step": 476 }, { "epoch": 0.6132090631528202, "grad_norm": 0.04651128128170967, "learning_rate": 2e-05, "loss": 0.3412, "step": 477 }, { "epoch": 0.6144946167443355, "grad_norm": 0.05821945145726204, "learning_rate": 2e-05, "loss": 0.2726, "step": 478 }, { "epoch": 0.6157801703358509, "grad_norm": 0.0519278421998024, "learning_rate": 2e-05, "loss": 0.2927, "step": 479 }, { "epoch": 0.6170657239273662, "grad_norm": 0.03331352025270462, "learning_rate": 2e-05, "loss": 0.1688, "step": 480 }, { "epoch": 0.6183512775188815, "grad_norm": 0.04451346397399902, "learning_rate": 2e-05, "loss": 0.213, "step": 481 }, { "epoch": 0.6196368311103969, "grad_norm": 0.04776597023010254, "learning_rate": 2e-05, "loss": 0.2826, "step": 482 }, { "epoch": 0.6209223847019123, "grad_norm": 0.0488264262676239, "learning_rate": 2e-05, "loss": 0.2886, "step": 483 }, { "epoch": 0.6222079382934276, "grad_norm": 0.04393550381064415, "learning_rate": 2e-05, "loss": 0.1949, "step": 484 }, { "epoch": 0.623493491884943, "grad_norm": 0.050872016698122025, "learning_rate": 2e-05, "loss": 0.2201, "step": 485 }, { "epoch": 0.6247790454764584, "grad_norm": 0.06177595257759094, "learning_rate": 2e-05, "loss": 0.318, "step": 486 }, { "epoch": 0.6260645990679736, "grad_norm": 0.03842415288090706, "learning_rate": 2e-05, "loss": 0.1763, "step": 487 }, { "epoch": 0.627350152659489, "grad_norm": 0.04788699373602867, "learning_rate": 2e-05, "loss": 0.247, "step": 488 }, { "epoch": 0.6286357062510043, "grad_norm": 0.05789102241396904, "learning_rate": 2e-05, "loss": 0.3338, "step": 489 }, { "epoch": 0.6299212598425197, "grad_norm": 0.04298072308301926, "learning_rate": 2e-05, "loss": 0.2321, "step": 490 }, { "epoch": 0.6312068134340351, "grad_norm": 0.03914102911949158, "learning_rate": 2e-05, "loss": 0.2341, "step": 491 }, { "epoch": 0.6324923670255503, "grad_norm": 0.04699448123574257, "learning_rate": 2e-05, "loss": 0.2462, "step": 492 }, { "epoch": 0.6337779206170657, "grad_norm": 0.04092938452959061, "learning_rate": 2e-05, "loss": 0.2378, "step": 493 }, { "epoch": 0.6350634742085811, "grad_norm": 0.0463721826672554, "learning_rate": 2e-05, "loss": 0.225, "step": 494 }, { "epoch": 0.6363490278000964, "grad_norm": 0.0489421971142292, "learning_rate": 2e-05, "loss": 0.2341, "step": 495 }, { "epoch": 0.6376345813916118, "grad_norm": 0.04278067871928215, "learning_rate": 2e-05, "loss": 0.234, "step": 496 }, { "epoch": 0.6389201349831272, "grad_norm": 0.04674089327454567, "learning_rate": 2e-05, "loss": 0.2755, "step": 497 }, { "epoch": 0.6402056885746424, "grad_norm": 0.056766536086797714, "learning_rate": 2e-05, "loss": 0.3656, "step": 498 }, { "epoch": 0.6414912421661578, "grad_norm": 0.04216759279370308, "learning_rate": 2e-05, "loss": 0.2931, "step": 499 }, { "epoch": 0.6427767957576731, "grad_norm": 0.04742797464132309, "learning_rate": 2e-05, "loss": 0.3386, "step": 500 }, { "epoch": 0.6440623493491885, "grad_norm": 0.05907592922449112, "learning_rate": 2e-05, "loss": 0.3194, "step": 501 }, { "epoch": 0.6453479029407039, "grad_norm": 0.047280214726924896, "learning_rate": 2e-05, "loss": 0.2993, "step": 502 }, { "epoch": 0.6466334565322192, "grad_norm": 0.03869684040546417, "learning_rate": 2e-05, "loss": 0.158, "step": 503 }, { "epoch": 0.6479190101237345, "grad_norm": 0.04897621273994446, "learning_rate": 2e-05, "loss": 0.1948, "step": 504 }, { "epoch": 0.6492045637152499, "grad_norm": 0.055846258997917175, "learning_rate": 2e-05, "loss": 0.312, "step": 505 }, { "epoch": 0.6504901173067652, "grad_norm": 0.04266876354813576, "learning_rate": 2e-05, "loss": 0.2377, "step": 506 }, { "epoch": 0.6517756708982806, "grad_norm": 0.050029207020998, "learning_rate": 2e-05, "loss": 0.1797, "step": 507 }, { "epoch": 0.6530612244897959, "grad_norm": 0.035082824528217316, "learning_rate": 2e-05, "loss": 0.1799, "step": 508 }, { "epoch": 0.6543467780813113, "grad_norm": 0.04430130124092102, "learning_rate": 2e-05, "loss": 0.2327, "step": 509 }, { "epoch": 0.6556323316728266, "grad_norm": 0.03854670003056526, "learning_rate": 2e-05, "loss": 0.2235, "step": 510 }, { "epoch": 0.6569178852643419, "grad_norm": 0.04970936104655266, "learning_rate": 2e-05, "loss": 0.2818, "step": 511 }, { "epoch": 0.6582034388558573, "grad_norm": 0.04700899496674538, "learning_rate": 2e-05, "loss": 0.2417, "step": 512 }, { "epoch": 0.6594889924473727, "grad_norm": 0.04256317391991615, "learning_rate": 2e-05, "loss": 0.2703, "step": 513 }, { "epoch": 0.660774546038888, "grad_norm": 0.04744260385632515, "learning_rate": 2e-05, "loss": 0.2048, "step": 514 }, { "epoch": 0.6620600996304034, "grad_norm": 0.04310823976993561, "learning_rate": 2e-05, "loss": 0.1897, "step": 515 }, { "epoch": 0.6633456532219187, "grad_norm": 0.04300684109330177, "learning_rate": 2e-05, "loss": 0.2139, "step": 516 }, { "epoch": 0.664631206813434, "grad_norm": 0.05581510066986084, "learning_rate": 2e-05, "loss": 0.2616, "step": 517 }, { "epoch": 0.6659167604049494, "grad_norm": 0.055505942553281784, "learning_rate": 2e-05, "loss": 0.2915, "step": 518 }, { "epoch": 0.6672023139964647, "grad_norm": 0.040814101696014404, "learning_rate": 2e-05, "loss": 0.2187, "step": 519 }, { "epoch": 0.6684878675879801, "grad_norm": 0.05864616110920906, "learning_rate": 2e-05, "loss": 0.3499, "step": 520 }, { "epoch": 0.6697734211794955, "grad_norm": 0.057373858988285065, "learning_rate": 2e-05, "loss": 0.3538, "step": 521 }, { "epoch": 0.6710589747710107, "grad_norm": 0.041141483932733536, "learning_rate": 2e-05, "loss": 0.2711, "step": 522 }, { "epoch": 0.6723445283625261, "grad_norm": 0.03994324058294296, "learning_rate": 2e-05, "loss": 0.182, "step": 523 }, { "epoch": 0.6736300819540415, "grad_norm": 0.04982011020183563, "learning_rate": 2e-05, "loss": 0.2911, "step": 524 }, { "epoch": 0.6749156355455568, "grad_norm": 0.04852016270160675, "learning_rate": 2e-05, "loss": 0.254, "step": 525 }, { "epoch": 0.6762011891370722, "grad_norm": 0.05752996355295181, "learning_rate": 2e-05, "loss": 0.2969, "step": 526 }, { "epoch": 0.6774867427285874, "grad_norm": 0.04058138653635979, "learning_rate": 2e-05, "loss": 0.1861, "step": 527 }, { "epoch": 0.6787722963201028, "grad_norm": 0.05575535446405411, "learning_rate": 2e-05, "loss": 0.3174, "step": 528 }, { "epoch": 0.6800578499116182, "grad_norm": 0.0468176007270813, "learning_rate": 2e-05, "loss": 0.2699, "step": 529 }, { "epoch": 0.6813434035031335, "grad_norm": 0.054678115993738174, "learning_rate": 2e-05, "loss": 0.3051, "step": 530 }, { "epoch": 0.6826289570946489, "grad_norm": 0.055189572274684906, "learning_rate": 2e-05, "loss": 0.2397, "step": 531 }, { "epoch": 0.6839145106861643, "grad_norm": 0.048087868839502335, "learning_rate": 2e-05, "loss": 0.2302, "step": 532 }, { "epoch": 0.6852000642776795, "grad_norm": 0.057727813720703125, "learning_rate": 2e-05, "loss": 0.2457, "step": 533 }, { "epoch": 0.6864856178691949, "grad_norm": 0.04846923425793648, "learning_rate": 2e-05, "loss": 0.2506, "step": 534 }, { "epoch": 0.6877711714607103, "grad_norm": 0.0410042330622673, "learning_rate": 2e-05, "loss": 0.198, "step": 535 }, { "epoch": 0.6890567250522256, "grad_norm": 0.05333555117249489, "learning_rate": 2e-05, "loss": 0.283, "step": 536 }, { "epoch": 0.690342278643741, "grad_norm": 0.05376364290714264, "learning_rate": 2e-05, "loss": 0.3337, "step": 537 }, { "epoch": 0.6916278322352563, "grad_norm": 0.04879291355609894, "learning_rate": 2e-05, "loss": 0.3075, "step": 538 }, { "epoch": 0.6929133858267716, "grad_norm": 0.0375969335436821, "learning_rate": 2e-05, "loss": 0.1652, "step": 539 }, { "epoch": 0.694198939418287, "grad_norm": 0.042424045503139496, "learning_rate": 2e-05, "loss": 0.2398, "step": 540 }, { "epoch": 0.6954844930098023, "grad_norm": 0.048496536910533905, "learning_rate": 2e-05, "loss": 0.239, "step": 541 }, { "epoch": 0.6967700466013177, "grad_norm": 0.04180686175823212, "learning_rate": 2e-05, "loss": 0.2521, "step": 542 }, { "epoch": 0.6980556001928331, "grad_norm": 0.046767883002758026, "learning_rate": 2e-05, "loss": 0.2113, "step": 543 }, { "epoch": 0.6993411537843484, "grad_norm": 0.05949412286281586, "learning_rate": 2e-05, "loss": 0.3443, "step": 544 }, { "epoch": 0.7006267073758637, "grad_norm": 0.04437008500099182, "learning_rate": 2e-05, "loss": 0.2244, "step": 545 }, { "epoch": 0.7019122609673791, "grad_norm": 0.04240270331501961, "learning_rate": 2e-05, "loss": 0.2239, "step": 546 }, { "epoch": 0.7031978145588944, "grad_norm": 0.04866647720336914, "learning_rate": 2e-05, "loss": 0.2846, "step": 547 }, { "epoch": 0.7044833681504098, "grad_norm": 0.04255237057805061, "learning_rate": 2e-05, "loss": 0.1759, "step": 548 }, { "epoch": 0.7057689217419251, "grad_norm": 0.04113907366991043, "learning_rate": 2e-05, "loss": 0.2481, "step": 549 }, { "epoch": 0.7070544753334405, "grad_norm": 0.04230246692895889, "learning_rate": 2e-05, "loss": 0.1963, "step": 550 }, { "epoch": 0.7083400289249558, "grad_norm": 0.05263131856918335, "learning_rate": 2e-05, "loss": 0.2355, "step": 551 }, { "epoch": 0.7096255825164711, "grad_norm": 0.041025299578905106, "learning_rate": 2e-05, "loss": 0.193, "step": 552 }, { "epoch": 0.7109111361079865, "grad_norm": 0.048196010291576385, "learning_rate": 2e-05, "loss": 0.2183, "step": 553 }, { "epoch": 0.7121966896995019, "grad_norm": 0.05287821963429451, "learning_rate": 2e-05, "loss": 0.2969, "step": 554 }, { "epoch": 0.7134822432910172, "grad_norm": 0.04392276331782341, "learning_rate": 2e-05, "loss": 0.2029, "step": 555 }, { "epoch": 0.7147677968825326, "grad_norm": 0.05237026512622833, "learning_rate": 2e-05, "loss": 0.2653, "step": 556 }, { "epoch": 0.7160533504740478, "grad_norm": 0.05913091078400612, "learning_rate": 2e-05, "loss": 0.2944, "step": 557 }, { "epoch": 0.7173389040655632, "grad_norm": 0.04113471135497093, "learning_rate": 2e-05, "loss": 0.2411, "step": 558 }, { "epoch": 0.7186244576570786, "grad_norm": 0.040105462074279785, "learning_rate": 2e-05, "loss": 0.1857, "step": 559 }, { "epoch": 0.7199100112485939, "grad_norm": 0.058607831597328186, "learning_rate": 2e-05, "loss": 0.1984, "step": 560 }, { "epoch": 0.7211955648401093, "grad_norm": 0.043256357312202454, "learning_rate": 2e-05, "loss": 0.2584, "step": 561 }, { "epoch": 0.7224811184316247, "grad_norm": 0.05908385291695595, "learning_rate": 2e-05, "loss": 0.33, "step": 562 }, { "epoch": 0.7237666720231399, "grad_norm": 0.050697483122348785, "learning_rate": 2e-05, "loss": 0.242, "step": 563 }, { "epoch": 0.7250522256146553, "grad_norm": 0.05611984431743622, "learning_rate": 2e-05, "loss": 0.3334, "step": 564 }, { "epoch": 0.7263377792061707, "grad_norm": 0.05749541521072388, "learning_rate": 2e-05, "loss": 0.2454, "step": 565 }, { "epoch": 0.727623332797686, "grad_norm": 0.05453288555145264, "learning_rate": 2e-05, "loss": 0.249, "step": 566 }, { "epoch": 0.7289088863892014, "grad_norm": 0.061655569821596146, "learning_rate": 2e-05, "loss": 0.2954, "step": 567 }, { "epoch": 0.7301944399807166, "grad_norm": 0.051404744386672974, "learning_rate": 2e-05, "loss": 0.2356, "step": 568 }, { "epoch": 0.731479993572232, "grad_norm": 0.04265725240111351, "learning_rate": 2e-05, "loss": 0.1842, "step": 569 }, { "epoch": 0.7327655471637474, "grad_norm": 0.06363217532634735, "learning_rate": 2e-05, "loss": 0.3187, "step": 570 }, { "epoch": 0.7340511007552627, "grad_norm": 0.04742373526096344, "learning_rate": 2e-05, "loss": 0.2286, "step": 571 }, { "epoch": 0.7353366543467781, "grad_norm": 0.05723915994167328, "learning_rate": 2e-05, "loss": 0.3183, "step": 572 }, { "epoch": 0.7366222079382935, "grad_norm": 0.04636276140809059, "learning_rate": 2e-05, "loss": 0.2172, "step": 573 }, { "epoch": 0.7379077615298087, "grad_norm": 0.041882552206516266, "learning_rate": 2e-05, "loss": 0.195, "step": 574 }, { "epoch": 0.7391933151213241, "grad_norm": 0.05022399127483368, "learning_rate": 2e-05, "loss": 0.2564, "step": 575 }, { "epoch": 0.7404788687128395, "grad_norm": 0.058215439319610596, "learning_rate": 2e-05, "loss": 0.3047, "step": 576 }, { "epoch": 0.7417644223043548, "grad_norm": 0.04993325099349022, "learning_rate": 2e-05, "loss": 0.1955, "step": 577 }, { "epoch": 0.7430499758958702, "grad_norm": 0.05288231745362282, "learning_rate": 2e-05, "loss": 0.3005, "step": 578 }, { "epoch": 0.7443355294873855, "grad_norm": 0.055686481297016144, "learning_rate": 2e-05, "loss": 0.3304, "step": 579 }, { "epoch": 0.7456210830789008, "grad_norm": 0.06084279343485832, "learning_rate": 2e-05, "loss": 0.3377, "step": 580 }, { "epoch": 0.7469066366704162, "grad_norm": 0.041104961186647415, "learning_rate": 2e-05, "loss": 0.2019, "step": 581 }, { "epoch": 0.7481921902619315, "grad_norm": 0.04409842938184738, "learning_rate": 2e-05, "loss": 0.2383, "step": 582 }, { "epoch": 0.7494777438534469, "grad_norm": 0.050962381064891815, "learning_rate": 2e-05, "loss": 0.2439, "step": 583 }, { "epoch": 0.7507632974449623, "grad_norm": 0.05231870710849762, "learning_rate": 2e-05, "loss": 0.2337, "step": 584 }, { "epoch": 0.7520488510364776, "grad_norm": 0.04085131362080574, "learning_rate": 2e-05, "loss": 0.1451, "step": 585 }, { "epoch": 0.753334404627993, "grad_norm": 0.04120944067835808, "learning_rate": 2e-05, "loss": 0.2029, "step": 586 }, { "epoch": 0.7546199582195082, "grad_norm": 0.0363801047205925, "learning_rate": 2e-05, "loss": 0.1393, "step": 587 }, { "epoch": 0.7559055118110236, "grad_norm": 0.04919865354895592, "learning_rate": 2e-05, "loss": 0.2308, "step": 588 }, { "epoch": 0.757191065402539, "grad_norm": 0.0516657792031765, "learning_rate": 2e-05, "loss": 0.3006, "step": 589 }, { "epoch": 0.7584766189940543, "grad_norm": 0.07350458204746246, "learning_rate": 2e-05, "loss": 0.3796, "step": 590 }, { "epoch": 0.7597621725855697, "grad_norm": 0.05353572219610214, "learning_rate": 2e-05, "loss": 0.2548, "step": 591 }, { "epoch": 0.761047726177085, "grad_norm": 0.04492725431919098, "learning_rate": 2e-05, "loss": 0.199, "step": 592 }, { "epoch": 0.7623332797686003, "grad_norm": 0.04892539232969284, "learning_rate": 2e-05, "loss": 0.2108, "step": 593 }, { "epoch": 0.7636188333601157, "grad_norm": 0.03860924020409584, "learning_rate": 2e-05, "loss": 0.1896, "step": 594 }, { "epoch": 0.7649043869516311, "grad_norm": 0.052807312458753586, "learning_rate": 2e-05, "loss": 0.2709, "step": 595 }, { "epoch": 0.7661899405431464, "grad_norm": 0.04871145263314247, "learning_rate": 2e-05, "loss": 0.2779, "step": 596 }, { "epoch": 0.7674754941346618, "grad_norm": 0.04021324962377548, "learning_rate": 2e-05, "loss": 0.2136, "step": 597 }, { "epoch": 0.768761047726177, "grad_norm": 0.050265613943338394, "learning_rate": 2e-05, "loss": 0.2601, "step": 598 }, { "epoch": 0.7700466013176924, "grad_norm": 0.03576774150133133, "learning_rate": 2e-05, "loss": 0.2114, "step": 599 }, { "epoch": 0.7713321549092078, "grad_norm": 0.055398743599653244, "learning_rate": 2e-05, "loss": 0.2701, "step": 600 }, { "epoch": 0.7726177085007231, "grad_norm": 0.06506812572479248, "learning_rate": 2e-05, "loss": 0.3518, "step": 601 }, { "epoch": 0.7739032620922385, "grad_norm": 0.037148088216781616, "learning_rate": 2e-05, "loss": 0.1438, "step": 602 }, { "epoch": 0.7751888156837539, "grad_norm": 0.046173613518476486, "learning_rate": 2e-05, "loss": 0.2294, "step": 603 }, { "epoch": 0.7764743692752691, "grad_norm": 0.06617863476276398, "learning_rate": 2e-05, "loss": 0.2888, "step": 604 }, { "epoch": 0.7777599228667845, "grad_norm": 0.051207173615694046, "learning_rate": 2e-05, "loss": 0.2624, "step": 605 }, { "epoch": 0.7790454764582998, "grad_norm": 0.041766516864299774, "learning_rate": 2e-05, "loss": 0.1881, "step": 606 }, { "epoch": 0.7803310300498152, "grad_norm": 0.05160610005259514, "learning_rate": 2e-05, "loss": 0.258, "step": 607 }, { "epoch": 0.7816165836413306, "grad_norm": 0.04584109038114548, "learning_rate": 2e-05, "loss": 0.2087, "step": 608 }, { "epoch": 0.7829021372328459, "grad_norm": 0.04200456291437149, "learning_rate": 2e-05, "loss": 0.2036, "step": 609 }, { "epoch": 0.7841876908243612, "grad_norm": 0.039162181317806244, "learning_rate": 2e-05, "loss": 0.1833, "step": 610 }, { "epoch": 0.7854732444158766, "grad_norm": 0.041861940175294876, "learning_rate": 2e-05, "loss": 0.1623, "step": 611 }, { "epoch": 0.7867587980073919, "grad_norm": 0.05622352659702301, "learning_rate": 2e-05, "loss": 0.3556, "step": 612 }, { "epoch": 0.7880443515989073, "grad_norm": 0.048621952533721924, "learning_rate": 2e-05, "loss": 0.2211, "step": 613 }, { "epoch": 0.7893299051904227, "grad_norm": 0.0437590628862381, "learning_rate": 2e-05, "loss": 0.2015, "step": 614 }, { "epoch": 0.790615458781938, "grad_norm": 0.05675414949655533, "learning_rate": 2e-05, "loss": 0.2416, "step": 615 }, { "epoch": 0.7919010123734533, "grad_norm": 0.03869640827178955, "learning_rate": 2e-05, "loss": 0.1655, "step": 616 }, { "epoch": 0.7931865659649686, "grad_norm": 0.04821722209453583, "learning_rate": 2e-05, "loss": 0.1902, "step": 617 }, { "epoch": 0.794472119556484, "grad_norm": 0.04423803463578224, "learning_rate": 2e-05, "loss": 0.157, "step": 618 }, { "epoch": 0.7957576731479994, "grad_norm": 0.04364867880940437, "learning_rate": 2e-05, "loss": 0.2406, "step": 619 }, { "epoch": 0.7970432267395147, "grad_norm": 0.059711892157793045, "learning_rate": 2e-05, "loss": 0.2981, "step": 620 }, { "epoch": 0.79832878033103, "grad_norm": 0.046063173562288284, "learning_rate": 2e-05, "loss": 0.2184, "step": 621 }, { "epoch": 0.7996143339225454, "grad_norm": 0.06073896959424019, "learning_rate": 2e-05, "loss": 0.2351, "step": 622 }, { "epoch": 0.8008998875140607, "grad_norm": 0.039248064160346985, "learning_rate": 2e-05, "loss": 0.1888, "step": 623 }, { "epoch": 0.8021854411055761, "grad_norm": 0.05402129143476486, "learning_rate": 2e-05, "loss": 0.3368, "step": 624 }, { "epoch": 0.8034709946970915, "grad_norm": 0.04230786859989166, "learning_rate": 2e-05, "loss": 0.1748, "step": 625 }, { "epoch": 0.8047565482886068, "grad_norm": 0.06045274809002876, "learning_rate": 2e-05, "loss": 0.3958, "step": 626 }, { "epoch": 0.8060421018801222, "grad_norm": 0.04717743769288063, "learning_rate": 2e-05, "loss": 0.2704, "step": 627 }, { "epoch": 0.8073276554716374, "grad_norm": 0.04878292232751846, "learning_rate": 2e-05, "loss": 0.2412, "step": 628 }, { "epoch": 0.8086132090631528, "grad_norm": 0.038947124034166336, "learning_rate": 2e-05, "loss": 0.2169, "step": 629 }, { "epoch": 0.8098987626546682, "grad_norm": 0.0614759586751461, "learning_rate": 2e-05, "loss": 0.3013, "step": 630 }, { "epoch": 0.8111843162461835, "grad_norm": 0.06246621906757355, "learning_rate": 2e-05, "loss": 0.2947, "step": 631 }, { "epoch": 0.8124698698376989, "grad_norm": 0.06976212561130524, "learning_rate": 2e-05, "loss": 0.297, "step": 632 }, { "epoch": 0.8137554234292143, "grad_norm": 0.03317941352725029, "learning_rate": 2e-05, "loss": 0.1375, "step": 633 }, { "epoch": 0.8150409770207295, "grad_norm": 0.06765579432249069, "learning_rate": 2e-05, "loss": 0.258, "step": 634 }, { "epoch": 0.8163265306122449, "grad_norm": 0.06797792762517929, "learning_rate": 2e-05, "loss": 0.256, "step": 635 }, { "epoch": 0.8176120842037602, "grad_norm": 0.059785496443510056, "learning_rate": 2e-05, "loss": 0.3343, "step": 636 }, { "epoch": 0.8188976377952756, "grad_norm": 0.059780728071928024, "learning_rate": 2e-05, "loss": 0.3634, "step": 637 }, { "epoch": 0.820183191386791, "grad_norm": 0.04111599549651146, "learning_rate": 2e-05, "loss": 0.2011, "step": 638 }, { "epoch": 0.8214687449783062, "grad_norm": 0.04656028002500534, "learning_rate": 2e-05, "loss": 0.2214, "step": 639 }, { "epoch": 0.8227542985698216, "grad_norm": 0.054362326860427856, "learning_rate": 2e-05, "loss": 0.2928, "step": 640 }, { "epoch": 0.824039852161337, "grad_norm": 0.04594152048230171, "learning_rate": 2e-05, "loss": 0.2285, "step": 641 }, { "epoch": 0.8253254057528523, "grad_norm": 0.056715745478868484, "learning_rate": 2e-05, "loss": 0.2531, "step": 642 }, { "epoch": 0.8266109593443677, "grad_norm": 0.049057237803936005, "learning_rate": 2e-05, "loss": 0.1749, "step": 643 }, { "epoch": 0.8278965129358831, "grad_norm": 0.05435045436024666, "learning_rate": 2e-05, "loss": 0.2796, "step": 644 }, { "epoch": 0.8291820665273983, "grad_norm": 0.049284275621175766, "learning_rate": 2e-05, "loss": 0.2381, "step": 645 }, { "epoch": 0.8304676201189137, "grad_norm": 0.044050633907318115, "learning_rate": 2e-05, "loss": 0.2804, "step": 646 }, { "epoch": 0.831753173710429, "grad_norm": 0.054185982793569565, "learning_rate": 2e-05, "loss": 0.2617, "step": 647 }, { "epoch": 0.8330387273019444, "grad_norm": 0.0534062534570694, "learning_rate": 2e-05, "loss": 0.2502, "step": 648 }, { "epoch": 0.8343242808934598, "grad_norm": 0.06242300197482109, "learning_rate": 2e-05, "loss": 0.2662, "step": 649 }, { "epoch": 0.8356098344849751, "grad_norm": 0.0385594442486763, "learning_rate": 2e-05, "loss": 0.1897, "step": 650 }, { "epoch": 0.8368953880764904, "grad_norm": 0.065641388297081, "learning_rate": 2e-05, "loss": 0.3179, "step": 651 }, { "epoch": 0.8381809416680058, "grad_norm": 0.054985061287879944, "learning_rate": 2e-05, "loss": 0.222, "step": 652 }, { "epoch": 0.8394664952595211, "grad_norm": 0.05766449496150017, "learning_rate": 2e-05, "loss": 0.289, "step": 653 }, { "epoch": 0.8407520488510365, "grad_norm": 0.04635515809059143, "learning_rate": 2e-05, "loss": 0.2464, "step": 654 }, { "epoch": 0.8420376024425518, "grad_norm": 0.0583229660987854, "learning_rate": 2e-05, "loss": 0.2436, "step": 655 }, { "epoch": 0.8433231560340672, "grad_norm": 0.04983345419168472, "learning_rate": 2e-05, "loss": 0.2534, "step": 656 }, { "epoch": 0.8446087096255825, "grad_norm": 0.04292474314570427, "learning_rate": 2e-05, "loss": 0.1772, "step": 657 }, { "epoch": 0.8458942632170978, "grad_norm": 0.05735989660024643, "learning_rate": 2e-05, "loss": 0.267, "step": 658 }, { "epoch": 0.8471798168086132, "grad_norm": 0.055415477603673935, "learning_rate": 2e-05, "loss": 0.2651, "step": 659 }, { "epoch": 0.8484653704001286, "grad_norm": 0.052020199596881866, "learning_rate": 2e-05, "loss": 0.2177, "step": 660 }, { "epoch": 0.8497509239916439, "grad_norm": 0.05934329703450203, "learning_rate": 2e-05, "loss": 0.2665, "step": 661 }, { "epoch": 0.8510364775831593, "grad_norm": 0.06611707806587219, "learning_rate": 2e-05, "loss": 0.3774, "step": 662 }, { "epoch": 0.8523220311746746, "grad_norm": 0.05337178707122803, "learning_rate": 2e-05, "loss": 0.2699, "step": 663 }, { "epoch": 0.8536075847661899, "grad_norm": 0.05552757531404495, "learning_rate": 2e-05, "loss": 0.2204, "step": 664 }, { "epoch": 0.8548931383577053, "grad_norm": 0.051326069980859756, "learning_rate": 2e-05, "loss": 0.1791, "step": 665 }, { "epoch": 0.8561786919492206, "grad_norm": 0.04780028760433197, "learning_rate": 2e-05, "loss": 0.1959, "step": 666 }, { "epoch": 0.857464245540736, "grad_norm": 0.06344909965991974, "learning_rate": 2e-05, "loss": 0.2809, "step": 667 }, { "epoch": 0.8587497991322514, "grad_norm": 0.0526767373085022, "learning_rate": 2e-05, "loss": 0.2547, "step": 668 }, { "epoch": 0.8600353527237666, "grad_norm": 0.04369194433093071, "learning_rate": 2e-05, "loss": 0.233, "step": 669 }, { "epoch": 0.861320906315282, "grad_norm": 0.05023709312081337, "learning_rate": 2e-05, "loss": 0.2576, "step": 670 }, { "epoch": 0.8626064599067974, "grad_norm": 0.06402754783630371, "learning_rate": 2e-05, "loss": 0.2579, "step": 671 }, { "epoch": 0.8638920134983127, "grad_norm": 0.06747744977474213, "learning_rate": 2e-05, "loss": 0.393, "step": 672 }, { "epoch": 0.8651775670898281, "grad_norm": 0.06799997389316559, "learning_rate": 2e-05, "loss": 0.3114, "step": 673 }, { "epoch": 0.8664631206813435, "grad_norm": 0.044738415628671646, "learning_rate": 2e-05, "loss": 0.222, "step": 674 }, { "epoch": 0.8677486742728587, "grad_norm": 0.05913526564836502, "learning_rate": 2e-05, "loss": 0.2701, "step": 675 }, { "epoch": 0.8690342278643741, "grad_norm": 0.052639495581388474, "learning_rate": 2e-05, "loss": 0.2279, "step": 676 }, { "epoch": 0.8703197814558894, "grad_norm": 0.0436641164124012, "learning_rate": 2e-05, "loss": 0.1722, "step": 677 }, { "epoch": 0.8716053350474048, "grad_norm": 0.06275106966495514, "learning_rate": 2e-05, "loss": 0.3289, "step": 678 }, { "epoch": 0.8728908886389202, "grad_norm": 0.034002162516117096, "learning_rate": 2e-05, "loss": 0.1262, "step": 679 }, { "epoch": 0.8741764422304354, "grad_norm": 0.04524555802345276, "learning_rate": 2e-05, "loss": 0.1765, "step": 680 }, { "epoch": 0.8754619958219508, "grad_norm": 0.04776989668607712, "learning_rate": 2e-05, "loss": 0.242, "step": 681 }, { "epoch": 0.8767475494134662, "grad_norm": 0.060143712908029556, "learning_rate": 2e-05, "loss": 0.24, "step": 682 }, { "epoch": 0.8780331030049815, "grad_norm": 0.06363454461097717, "learning_rate": 2e-05, "loss": 0.3104, "step": 683 }, { "epoch": 0.8793186565964969, "grad_norm": 0.05736486613750458, "learning_rate": 2e-05, "loss": 0.3299, "step": 684 }, { "epoch": 0.8806042101880122, "grad_norm": 0.048391181975603104, "learning_rate": 2e-05, "loss": 0.1937, "step": 685 }, { "epoch": 0.8818897637795275, "grad_norm": 0.047165125608444214, "learning_rate": 2e-05, "loss": 0.2608, "step": 686 }, { "epoch": 0.8831753173710429, "grad_norm": 0.061681345105171204, "learning_rate": 2e-05, "loss": 0.2948, "step": 687 }, { "epoch": 0.8844608709625582, "grad_norm": 0.060136910527944565, "learning_rate": 2e-05, "loss": 0.2272, "step": 688 }, { "epoch": 0.8857464245540736, "grad_norm": 0.047498807311058044, "learning_rate": 2e-05, "loss": 0.1813, "step": 689 }, { "epoch": 0.887031978145589, "grad_norm": 0.06447866559028625, "learning_rate": 2e-05, "loss": 0.2808, "step": 690 }, { "epoch": 0.8883175317371043, "grad_norm": 0.05992686748504639, "learning_rate": 2e-05, "loss": 0.262, "step": 691 }, { "epoch": 0.8896030853286196, "grad_norm": 0.048196423798799515, "learning_rate": 2e-05, "loss": 0.2238, "step": 692 }, { "epoch": 0.890888638920135, "grad_norm": 0.06860709935426712, "learning_rate": 2e-05, "loss": 0.2679, "step": 693 }, { "epoch": 0.8921741925116503, "grad_norm": 0.05085690692067146, "learning_rate": 2e-05, "loss": 0.2948, "step": 694 }, { "epoch": 0.8934597461031657, "grad_norm": 0.06869999319314957, "learning_rate": 2e-05, "loss": 0.2961, "step": 695 }, { "epoch": 0.894745299694681, "grad_norm": 0.04691535234451294, "learning_rate": 2e-05, "loss": 0.2019, "step": 696 }, { "epoch": 0.8960308532861964, "grad_norm": 0.04785510525107384, "learning_rate": 2e-05, "loss": 0.147, "step": 697 }, { "epoch": 0.8973164068777117, "grad_norm": 0.06156083196401596, "learning_rate": 2e-05, "loss": 0.2215, "step": 698 }, { "epoch": 0.898601960469227, "grad_norm": 0.051647745072841644, "learning_rate": 2e-05, "loss": 0.2252, "step": 699 }, { "epoch": 0.8998875140607424, "grad_norm": 0.04751814156770706, "learning_rate": 2e-05, "loss": 0.2482, "step": 700 }, { "epoch": 0.9011730676522578, "grad_norm": 0.05452054366469383, "learning_rate": 2e-05, "loss": 0.2138, "step": 701 }, { "epoch": 0.9024586212437731, "grad_norm": 0.045277033001184464, "learning_rate": 2e-05, "loss": 0.2148, "step": 702 }, { "epoch": 0.9037441748352885, "grad_norm": 0.045462466776371, "learning_rate": 2e-05, "loss": 0.1711, "step": 703 }, { "epoch": 0.9050297284268038, "grad_norm": 0.06722573935985565, "learning_rate": 2e-05, "loss": 0.3205, "step": 704 }, { "epoch": 0.9063152820183191, "grad_norm": 0.05163208395242691, "learning_rate": 2e-05, "loss": 0.247, "step": 705 }, { "epoch": 0.9076008356098345, "grad_norm": 0.052614837884902954, "learning_rate": 2e-05, "loss": 0.2002, "step": 706 }, { "epoch": 0.9088863892013498, "grad_norm": 0.03826769068837166, "learning_rate": 2e-05, "loss": 0.1744, "step": 707 }, { "epoch": 0.9101719427928652, "grad_norm": 0.04780410975217819, "learning_rate": 2e-05, "loss": 0.2524, "step": 708 }, { "epoch": 0.9114574963843806, "grad_norm": 0.03547963872551918, "learning_rate": 2e-05, "loss": 0.1674, "step": 709 }, { "epoch": 0.9127430499758958, "grad_norm": 0.0573282465338707, "learning_rate": 2e-05, "loss": 0.2749, "step": 710 }, { "epoch": 0.9140286035674112, "grad_norm": 0.0570538304746151, "learning_rate": 2e-05, "loss": 0.2412, "step": 711 }, { "epoch": 0.9153141571589266, "grad_norm": 0.054683949798345566, "learning_rate": 2e-05, "loss": 0.2537, "step": 712 }, { "epoch": 0.9165997107504419, "grad_norm": 0.05413772165775299, "learning_rate": 2e-05, "loss": 0.2314, "step": 713 }, { "epoch": 0.9178852643419573, "grad_norm": 0.05124877020716667, "learning_rate": 2e-05, "loss": 0.2645, "step": 714 }, { "epoch": 0.9191708179334726, "grad_norm": 0.06577921658754349, "learning_rate": 2e-05, "loss": 0.314, "step": 715 }, { "epoch": 0.9204563715249879, "grad_norm": 0.05663186311721802, "learning_rate": 2e-05, "loss": 0.2422, "step": 716 }, { "epoch": 0.9217419251165033, "grad_norm": 0.05851929262280464, "learning_rate": 2e-05, "loss": 0.2845, "step": 717 }, { "epoch": 0.9230274787080186, "grad_norm": 0.06582541763782501, "learning_rate": 2e-05, "loss": 0.2487, "step": 718 }, { "epoch": 0.924313032299534, "grad_norm": 0.0434844084084034, "learning_rate": 2e-05, "loss": 0.191, "step": 719 }, { "epoch": 0.9255985858910494, "grad_norm": 0.056996386498212814, "learning_rate": 2e-05, "loss": 0.2733, "step": 720 }, { "epoch": 0.9268841394825647, "grad_norm": 0.04399803280830383, "learning_rate": 2e-05, "loss": 0.1991, "step": 721 }, { "epoch": 0.92816969307408, "grad_norm": 0.047656819224357605, "learning_rate": 2e-05, "loss": 0.2274, "step": 722 }, { "epoch": 0.9294552466655954, "grad_norm": 0.0753135085105896, "learning_rate": 2e-05, "loss": 0.3748, "step": 723 }, { "epoch": 0.9307408002571107, "grad_norm": 0.07544931024312973, "learning_rate": 2e-05, "loss": 0.282, "step": 724 }, { "epoch": 0.9320263538486261, "grad_norm": 0.05577397346496582, "learning_rate": 2e-05, "loss": 0.273, "step": 725 }, { "epoch": 0.9333119074401414, "grad_norm": 0.039960604161024094, "learning_rate": 2e-05, "loss": 0.1423, "step": 726 }, { "epoch": 0.9345974610316568, "grad_norm": 0.0625922679901123, "learning_rate": 2e-05, "loss": 0.2504, "step": 727 }, { "epoch": 0.9358830146231721, "grad_norm": 0.060125015676021576, "learning_rate": 2e-05, "loss": 0.2061, "step": 728 }, { "epoch": 0.9371685682146874, "grad_norm": 0.06697895377874374, "learning_rate": 2e-05, "loss": 0.2672, "step": 729 }, { "epoch": 0.9384541218062028, "grad_norm": 0.09079831093549728, "learning_rate": 2e-05, "loss": 0.3944, "step": 730 }, { "epoch": 0.9397396753977182, "grad_norm": 0.05246804282069206, "learning_rate": 2e-05, "loss": 0.2153, "step": 731 }, { "epoch": 0.9410252289892335, "grad_norm": 0.03938793018460274, "learning_rate": 2e-05, "loss": 0.1496, "step": 732 }, { "epoch": 0.9423107825807489, "grad_norm": 0.05081872642040253, "learning_rate": 2e-05, "loss": 0.1939, "step": 733 }, { "epoch": 0.9435963361722641, "grad_norm": 0.055075064301490784, "learning_rate": 2e-05, "loss": 0.2314, "step": 734 }, { "epoch": 0.9448818897637795, "grad_norm": 0.057048946619033813, "learning_rate": 2e-05, "loss": 0.2258, "step": 735 }, { "epoch": 0.9461674433552949, "grad_norm": 0.0564640611410141, "learning_rate": 2e-05, "loss": 0.221, "step": 736 }, { "epoch": 0.9474529969468102, "grad_norm": 0.06246118247509003, "learning_rate": 2e-05, "loss": 0.2655, "step": 737 }, { "epoch": 0.9487385505383256, "grad_norm": 0.06543996930122375, "learning_rate": 2e-05, "loss": 0.3487, "step": 738 }, { "epoch": 0.950024104129841, "grad_norm": 0.05123418942093849, "learning_rate": 2e-05, "loss": 0.2593, "step": 739 }, { "epoch": 0.9513096577213562, "grad_norm": 0.04761409014463425, "learning_rate": 2e-05, "loss": 0.1717, "step": 740 }, { "epoch": 0.9525952113128716, "grad_norm": 0.05747079476714134, "learning_rate": 2e-05, "loss": 0.2239, "step": 741 }, { "epoch": 0.953880764904387, "grad_norm": 0.04854227229952812, "learning_rate": 2e-05, "loss": 0.1742, "step": 742 }, { "epoch": 0.9551663184959023, "grad_norm": 0.05784037709236145, "learning_rate": 2e-05, "loss": 0.203, "step": 743 }, { "epoch": 0.9564518720874177, "grad_norm": 0.05370228737592697, "learning_rate": 2e-05, "loss": 0.255, "step": 744 }, { "epoch": 0.9577374256789329, "grad_norm": 0.04535800218582153, "learning_rate": 2e-05, "loss": 0.1951, "step": 745 }, { "epoch": 0.9590229792704483, "grad_norm": 0.044412512332201004, "learning_rate": 2e-05, "loss": 0.2087, "step": 746 }, { "epoch": 0.9603085328619637, "grad_norm": 0.05077359825372696, "learning_rate": 2e-05, "loss": 0.19, "step": 747 }, { "epoch": 0.961594086453479, "grad_norm": 0.056578539311885834, "learning_rate": 2e-05, "loss": 0.2784, "step": 748 }, { "epoch": 0.9628796400449944, "grad_norm": 0.04252656549215317, "learning_rate": 2e-05, "loss": 0.239, "step": 749 }, { "epoch": 0.9641651936365098, "grad_norm": 0.04754233360290527, "learning_rate": 2e-05, "loss": 0.1871, "step": 750 }, { "epoch": 0.965450747228025, "grad_norm": 0.04948977380990982, "learning_rate": 2e-05, "loss": 0.2095, "step": 751 }, { "epoch": 0.9667363008195404, "grad_norm": 0.056569986045360565, "learning_rate": 2e-05, "loss": 0.1627, "step": 752 }, { "epoch": 0.9680218544110558, "grad_norm": 0.058012060821056366, "learning_rate": 2e-05, "loss": 0.277, "step": 753 }, { "epoch": 0.9693074080025711, "grad_norm": 0.06445303559303284, "learning_rate": 2e-05, "loss": 0.3453, "step": 754 }, { "epoch": 0.9705929615940865, "grad_norm": 0.04822942987084389, "learning_rate": 2e-05, "loss": 0.1958, "step": 755 }, { "epoch": 0.9718785151856018, "grad_norm": 0.04951447993516922, "learning_rate": 2e-05, "loss": 0.2342, "step": 756 }, { "epoch": 0.9731640687771171, "grad_norm": 0.04779404401779175, "learning_rate": 2e-05, "loss": 0.2277, "step": 757 }, { "epoch": 0.9744496223686325, "grad_norm": 0.047998420894145966, "learning_rate": 2e-05, "loss": 0.1817, "step": 758 }, { "epoch": 0.9757351759601478, "grad_norm": 0.050718434154987335, "learning_rate": 2e-05, "loss": 0.2289, "step": 759 }, { "epoch": 0.9770207295516632, "grad_norm": 0.05427386984229088, "learning_rate": 2e-05, "loss": 0.2597, "step": 760 }, { "epoch": 0.9783062831431786, "grad_norm": 0.06047537922859192, "learning_rate": 2e-05, "loss": 0.2597, "step": 761 }, { "epoch": 0.9795918367346939, "grad_norm": 0.048412878066301346, "learning_rate": 2e-05, "loss": 0.2386, "step": 762 }, { "epoch": 0.9808773903262092, "grad_norm": 0.04905233159661293, "learning_rate": 2e-05, "loss": 0.2239, "step": 763 }, { "epoch": 0.9821629439177245, "grad_norm": 0.052379023283720016, "learning_rate": 2e-05, "loss": 0.263, "step": 764 }, { "epoch": 0.9834484975092399, "grad_norm": 0.0489642396569252, "learning_rate": 2e-05, "loss": 0.225, "step": 765 }, { "epoch": 0.9847340511007553, "grad_norm": 0.050984520465135574, "learning_rate": 2e-05, "loss": 0.219, "step": 766 }, { "epoch": 0.9860196046922706, "grad_norm": 0.05487053468823433, "learning_rate": 2e-05, "loss": 0.1788, "step": 767 }, { "epoch": 0.987305158283786, "grad_norm": 0.06488880515098572, "learning_rate": 2e-05, "loss": 0.2994, "step": 768 }, { "epoch": 0.9885907118753013, "grad_norm": 0.057233408093452454, "learning_rate": 2e-05, "loss": 0.3028, "step": 769 }, { "epoch": 0.9898762654668166, "grad_norm": 0.03885122016072273, "learning_rate": 2e-05, "loss": 0.1704, "step": 770 }, { "epoch": 0.991161819058332, "grad_norm": 0.04395405203104019, "learning_rate": 2e-05, "loss": 0.168, "step": 771 }, { "epoch": 0.9924473726498474, "grad_norm": 0.07156252861022949, "learning_rate": 2e-05, "loss": 0.3431, "step": 772 }, { "epoch": 0.9937329262413627, "grad_norm": 0.05737178027629852, "learning_rate": 2e-05, "loss": 0.2595, "step": 773 }, { "epoch": 0.9950184798328781, "grad_norm": 0.0596122108399868, "learning_rate": 2e-05, "loss": 0.2177, "step": 774 }, { "epoch": 0.9963040334243933, "grad_norm": 0.0480956956744194, "learning_rate": 2e-05, "loss": 0.2008, "step": 775 }, { "epoch": 0.9975895870159087, "grad_norm": 0.045857105404138565, "learning_rate": 2e-05, "loss": 0.2093, "step": 776 }, { "epoch": 0.9988751406074241, "grad_norm": 0.05208531767129898, "learning_rate": 2e-05, "loss": 0.1512, "step": 777 }, { "epoch": 0.9988751406074241, "step": 777, "total_flos": 525522702336000.0, "train_loss": 0.3197911096739186, "train_runtime": 4432.7229, "train_samples_per_second": 5.616, "train_steps_per_second": 0.175 } ], "logging_steps": 1.0, "max_steps": 777, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 525522702336000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }