diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5481 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9988751406074241, + "eval_steps": 500, + "global_step": 777, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012855535915153463, + "grad_norm": 0.020836442708969116, + "learning_rate": 0.0, + "loss": 0.6262, + "step": 1 + }, + { + "epoch": 0.0025711071830306926, + "grad_norm": 0.022134091705083847, + "learning_rate": 4.075900941810124e-06, + "loss": 0.8688, + "step": 2 + }, + { + "epoch": 0.003856660774546039, + "grad_norm": 0.023200005292892456, + "learning_rate": 6.46015014942309e-06, + "loss": 0.7864, + "step": 3 + }, + { + "epoch": 0.005142214366061385, + "grad_norm": 0.02313530258834362, + "learning_rate": 8.151801883620247e-06, + "loss": 0.8897, + "step": 4 + }, + { + "epoch": 0.0064277679575767315, + "grad_norm": 0.020561356097459793, + "learning_rate": 9.463948908766788e-06, + "loss": 0.6479, + "step": 5 + }, + { + "epoch": 0.007713321549092078, + "grad_norm": 0.021870166063308716, + "learning_rate": 1.0536051091233212e-05, + "loss": 0.7501, + "step": 6 + }, + { + "epoch": 0.008998875140607425, + "grad_norm": 0.023460067808628082, + "learning_rate": 1.1442500570809876e-05, + "loss": 0.8672, + "step": 7 + }, + { + "epoch": 0.01028442873212277, + "grad_norm": 0.02368471957743168, + "learning_rate": 1.222770282543037e-05, + "loss": 0.8984, + "step": 8 + }, + { + "epoch": 0.011569982323638118, + "grad_norm": 0.020765064284205437, + "learning_rate": 1.292030029884618e-05, + "loss": 0.6547, + "step": 9 + }, + { + "epoch": 0.012855535915153463, + "grad_norm": 0.023947741836309433, + "learning_rate": 1.3539849850576912e-05, + "loss": 0.8205, + "step": 10 + }, + { + "epoch": 0.01414108950666881, + "grad_norm": 0.028013406321406364, + "learning_rate": 1.4100300592531481e-05, + "loss": 0.7891, + "step": 11 + }, + { + "epoch": 0.015426643098184156, + "grad_norm": 0.027111703529953957, + "learning_rate": 1.4611952033043337e-05, + "loss": 0.7678, + "step": 12 + }, + { + "epoch": 0.0167121966896995, + "grad_norm": 0.028518904000520706, + "learning_rate": 1.5082625732282867e-05, + "loss": 0.8091, + "step": 13 + }, + { + "epoch": 0.01799775028121485, + "grad_norm": 0.022600186988711357, + "learning_rate": 1.551840151262e-05, + "loss": 0.5875, + "step": 14 + }, + { + "epoch": 0.019283303872730195, + "grad_norm": 0.021580247208476067, + "learning_rate": 1.5924099058189875e-05, + "loss": 0.6714, + "step": 15 + }, + { + "epoch": 0.02056885746424554, + "grad_norm": 0.02505405619740486, + "learning_rate": 1.6303603767240495e-05, + "loss": 0.7534, + "step": 16 + }, + { + "epoch": 0.021854411055760886, + "grad_norm": 0.024437466636300087, + "learning_rate": 1.6660093644266146e-05, + "loss": 0.6945, + "step": 17 + }, + { + "epoch": 0.023139964647276235, + "grad_norm": 0.028052283450961113, + "learning_rate": 1.6996201240656302e-05, + "loss": 0.7076, + "step": 18 + }, + { + "epoch": 0.02442551823879158, + "grad_norm": 0.03632762283086777, + "learning_rate": 1.7314131752785847e-05, + "loss": 0.769, + "step": 19 + }, + { + "epoch": 0.025711071830306926, + "grad_norm": 0.02896072156727314, + "learning_rate": 1.7615750792387035e-05, + "loss": 0.8087, + "step": 20 + }, + { + "epoch": 0.02699662542182227, + "grad_norm": 0.034198954701423645, + "learning_rate": 1.7902650720232966e-05, + "loss": 0.8161, + "step": 21 + }, + { + "epoch": 0.02828217901333762, + "grad_norm": 0.03110469877719879, + "learning_rate": 1.8176201534341607e-05, + "loss": 0.8253, + "step": 22 + }, + { + "epoch": 0.029567732604852966, + "grad_norm": 0.039295781403779984, + "learning_rate": 1.8437590437029225e-05, + "loss": 0.9744, + "step": 23 + }, + { + "epoch": 0.03085328619636831, + "grad_norm": 0.03249296918511391, + "learning_rate": 1.868785297485346e-05, + "loss": 0.6455, + "step": 24 + }, + { + "epoch": 0.032138839787883657, + "grad_norm": 0.03106599487364292, + "learning_rate": 1.8927897817533575e-05, + "loss": 0.7005, + "step": 25 + }, + { + "epoch": 0.033424393379399, + "grad_norm": 0.03536655381321907, + "learning_rate": 1.915852667409299e-05, + "loss": 0.8004, + "step": 26 + }, + { + "epoch": 0.03470994697091435, + "grad_norm": 0.035472676157951355, + "learning_rate": 1.9380450448269272e-05, + "loss": 0.675, + "step": 27 + }, + { + "epoch": 0.0359955005624297, + "grad_norm": 0.03877939283847809, + "learning_rate": 1.9594302454430122e-05, + "loss": 0.6278, + "step": 28 + }, + { + "epoch": 0.037281054153945045, + "grad_norm": 0.041341230273246765, + "learning_rate": 1.9800649313336155e-05, + "loss": 0.914, + "step": 29 + }, + { + "epoch": 0.03856660774546039, + "grad_norm": 0.042063791304826736, + "learning_rate": 2e-05, + "loss": 0.7664, + "step": 30 + }, + { + "epoch": 0.039852161336975736, + "grad_norm": 0.04166961461305618, + "learning_rate": 2e-05, + "loss": 0.6324, + "step": 31 + }, + { + "epoch": 0.04113771492849108, + "grad_norm": 0.04256080463528633, + "learning_rate": 2e-05, + "loss": 0.6771, + "step": 32 + }, + { + "epoch": 0.04242326852000643, + "grad_norm": 0.042959265410900116, + "learning_rate": 2e-05, + "loss": 0.6018, + "step": 33 + }, + { + "epoch": 0.04370882211152177, + "grad_norm": 0.03880544751882553, + "learning_rate": 2e-05, + "loss": 0.5819, + "step": 34 + }, + { + "epoch": 0.04499437570303712, + "grad_norm": 0.0412827730178833, + "learning_rate": 2e-05, + "loss": 0.8133, + "step": 35 + }, + { + "epoch": 0.04627992929455247, + "grad_norm": 0.04274650663137436, + "learning_rate": 2e-05, + "loss": 0.6237, + "step": 36 + }, + { + "epoch": 0.047565482886067816, + "grad_norm": 0.04136871546506882, + "learning_rate": 2e-05, + "loss": 0.5851, + "step": 37 + }, + { + "epoch": 0.04885103647758316, + "grad_norm": 0.04220248758792877, + "learning_rate": 2e-05, + "loss": 0.7603, + "step": 38 + }, + { + "epoch": 0.050136590069098506, + "grad_norm": 0.039129678159952164, + "learning_rate": 2e-05, + "loss": 0.5206, + "step": 39 + }, + { + "epoch": 0.05142214366061385, + "grad_norm": 0.04173429682850838, + "learning_rate": 2e-05, + "loss": 0.6602, + "step": 40 + }, + { + "epoch": 0.0527076972521292, + "grad_norm": 0.040010105818510056, + "learning_rate": 2e-05, + "loss": 0.5964, + "step": 41 + }, + { + "epoch": 0.05399325084364454, + "grad_norm": 0.03841459006071091, + "learning_rate": 2e-05, + "loss": 0.6162, + "step": 42 + }, + { + "epoch": 0.05527880443515989, + "grad_norm": 0.04042840003967285, + "learning_rate": 2e-05, + "loss": 0.535, + "step": 43 + }, + { + "epoch": 0.05656435802667524, + "grad_norm": 0.040401577949523926, + "learning_rate": 2e-05, + "loss": 0.5685, + "step": 44 + }, + { + "epoch": 0.057849911618190586, + "grad_norm": 0.06742753833532333, + "learning_rate": 2e-05, + "loss": 0.8276, + "step": 45 + }, + { + "epoch": 0.05913546520970593, + "grad_norm": 0.040345244109630585, + "learning_rate": 2e-05, + "loss": 0.5988, + "step": 46 + }, + { + "epoch": 0.06042101880122128, + "grad_norm": 0.0415828563272953, + "learning_rate": 2e-05, + "loss": 0.6151, + "step": 47 + }, + { + "epoch": 0.06170657239273662, + "grad_norm": 0.041223231703042984, + "learning_rate": 2e-05, + "loss": 0.638, + "step": 48 + }, + { + "epoch": 0.06299212598425197, + "grad_norm": 0.03628067672252655, + "learning_rate": 2e-05, + "loss": 0.5031, + "step": 49 + }, + { + "epoch": 0.06427767957576731, + "grad_norm": 0.04399935156106949, + "learning_rate": 2e-05, + "loss": 0.6615, + "step": 50 + }, + { + "epoch": 0.06556323316728266, + "grad_norm": 0.04084352031350136, + "learning_rate": 2e-05, + "loss": 0.5703, + "step": 51 + }, + { + "epoch": 0.066848786758798, + "grad_norm": 0.039231687784194946, + "learning_rate": 2e-05, + "loss": 0.5325, + "step": 52 + }, + { + "epoch": 0.06813434035031335, + "grad_norm": 0.04078860580921173, + "learning_rate": 2e-05, + "loss": 0.5959, + "step": 53 + }, + { + "epoch": 0.0694198939418287, + "grad_norm": 0.03753922879695892, + "learning_rate": 2e-05, + "loss": 0.5652, + "step": 54 + }, + { + "epoch": 0.07070544753334404, + "grad_norm": 0.041337307542562485, + "learning_rate": 2e-05, + "loss": 0.5579, + "step": 55 + }, + { + "epoch": 0.0719910011248594, + "grad_norm": 0.03940434008836746, + "learning_rate": 2e-05, + "loss": 0.4821, + "step": 56 + }, + { + "epoch": 0.07327655471637474, + "grad_norm": 0.03760010376572609, + "learning_rate": 2e-05, + "loss": 0.4442, + "step": 57 + }, + { + "epoch": 0.07456210830789009, + "grad_norm": 0.042540181428194046, + "learning_rate": 2e-05, + "loss": 0.5285, + "step": 58 + }, + { + "epoch": 0.07584766189940544, + "grad_norm": 0.0457993820309639, + "learning_rate": 2e-05, + "loss": 0.7136, + "step": 59 + }, + { + "epoch": 0.07713321549092078, + "grad_norm": 0.033564481884241104, + "learning_rate": 2e-05, + "loss": 0.4617, + "step": 60 + }, + { + "epoch": 0.07841876908243613, + "grad_norm": 0.041546691209077835, + "learning_rate": 2e-05, + "loss": 0.6912, + "step": 61 + }, + { + "epoch": 0.07970432267395147, + "grad_norm": 0.03729071840643883, + "learning_rate": 2e-05, + "loss": 0.4584, + "step": 62 + }, + { + "epoch": 0.08098987626546682, + "grad_norm": 0.04159967973828316, + "learning_rate": 2e-05, + "loss": 0.5392, + "step": 63 + }, + { + "epoch": 0.08227542985698216, + "grad_norm": 0.03827968239784241, + "learning_rate": 2e-05, + "loss": 0.5548, + "step": 64 + }, + { + "epoch": 0.08356098344849751, + "grad_norm": 0.04405729100108147, + "learning_rate": 2e-05, + "loss": 0.6239, + "step": 65 + }, + { + "epoch": 0.08484653704001285, + "grad_norm": 0.03460558503866196, + "learning_rate": 2e-05, + "loss": 0.4405, + "step": 66 + }, + { + "epoch": 0.0861320906315282, + "grad_norm": 0.030664170160889626, + "learning_rate": 2e-05, + "loss": 0.3396, + "step": 67 + }, + { + "epoch": 0.08741764422304354, + "grad_norm": 0.0376565083861351, + "learning_rate": 2e-05, + "loss": 0.5822, + "step": 68 + }, + { + "epoch": 0.08870319781455889, + "grad_norm": 0.0384797677397728, + "learning_rate": 2e-05, + "loss": 0.5402, + "step": 69 + }, + { + "epoch": 0.08998875140607424, + "grad_norm": 0.030342888087034225, + "learning_rate": 2e-05, + "loss": 0.371, + "step": 70 + }, + { + "epoch": 0.09127430499758958, + "grad_norm": 0.0449620746076107, + "learning_rate": 2e-05, + "loss": 0.5723, + "step": 71 + }, + { + "epoch": 0.09255985858910494, + "grad_norm": 0.03808669000864029, + "learning_rate": 2e-05, + "loss": 0.4842, + "step": 72 + }, + { + "epoch": 0.09384541218062029, + "grad_norm": 0.03985065966844559, + "learning_rate": 2e-05, + "loss": 0.4957, + "step": 73 + }, + { + "epoch": 0.09513096577213563, + "grad_norm": 0.030943365767598152, + "learning_rate": 2e-05, + "loss": 0.4595, + "step": 74 + }, + { + "epoch": 0.09641651936365098, + "grad_norm": 0.03418966010212898, + "learning_rate": 2e-05, + "loss": 0.3757, + "step": 75 + }, + { + "epoch": 0.09770207295516632, + "grad_norm": 0.033448606729507446, + "learning_rate": 2e-05, + "loss": 0.4325, + "step": 76 + }, + { + "epoch": 0.09898762654668167, + "grad_norm": 0.039748664945364, + "learning_rate": 2e-05, + "loss": 0.507, + "step": 77 + }, + { + "epoch": 0.10027318013819701, + "grad_norm": 0.04277816414833069, + "learning_rate": 2e-05, + "loss": 0.6296, + "step": 78 + }, + { + "epoch": 0.10155873372971236, + "grad_norm": 0.029562752693891525, + "learning_rate": 2e-05, + "loss": 0.395, + "step": 79 + }, + { + "epoch": 0.1028442873212277, + "grad_norm": 0.029590601101517677, + "learning_rate": 2e-05, + "loss": 0.3807, + "step": 80 + }, + { + "epoch": 0.10412984091274305, + "grad_norm": 0.031173471361398697, + "learning_rate": 2e-05, + "loss": 0.4175, + "step": 81 + }, + { + "epoch": 0.1054153945042584, + "grad_norm": 0.03821694105863571, + "learning_rate": 2e-05, + "loss": 0.5547, + "step": 82 + }, + { + "epoch": 0.10670094809577374, + "grad_norm": 0.02932704985141754, + "learning_rate": 2e-05, + "loss": 0.3954, + "step": 83 + }, + { + "epoch": 0.10798650168728909, + "grad_norm": 0.030441921204328537, + "learning_rate": 2e-05, + "loss": 0.4564, + "step": 84 + }, + { + "epoch": 0.10927205527880443, + "grad_norm": 0.03350207954645157, + "learning_rate": 2e-05, + "loss": 0.4477, + "step": 85 + }, + { + "epoch": 0.11055760887031978, + "grad_norm": 0.030435308814048767, + "learning_rate": 2e-05, + "loss": 0.4292, + "step": 86 + }, + { + "epoch": 0.11184316246183512, + "grad_norm": 0.03452485054731369, + "learning_rate": 2e-05, + "loss": 0.4572, + "step": 87 + }, + { + "epoch": 0.11312871605335048, + "grad_norm": 0.029849708080291748, + "learning_rate": 2e-05, + "loss": 0.3826, + "step": 88 + }, + { + "epoch": 0.11441426964486583, + "grad_norm": 0.026589911431074142, + "learning_rate": 2e-05, + "loss": 0.3335, + "step": 89 + }, + { + "epoch": 0.11569982323638117, + "grad_norm": 0.03767862543463707, + "learning_rate": 2e-05, + "loss": 0.5377, + "step": 90 + }, + { + "epoch": 0.11698537682789652, + "grad_norm": 0.030503496527671814, + "learning_rate": 2e-05, + "loss": 0.4019, + "step": 91 + }, + { + "epoch": 0.11827093041941186, + "grad_norm": 0.02843611314892769, + "learning_rate": 2e-05, + "loss": 0.378, + "step": 92 + }, + { + "epoch": 0.11955648401092721, + "grad_norm": 0.02735988050699234, + "learning_rate": 2e-05, + "loss": 0.3842, + "step": 93 + }, + { + "epoch": 0.12084203760244255, + "grad_norm": 0.03628378361463547, + "learning_rate": 2e-05, + "loss": 0.502, + "step": 94 + }, + { + "epoch": 0.1221275911939579, + "grad_norm": 0.029980337247252464, + "learning_rate": 2e-05, + "loss": 0.4177, + "step": 95 + }, + { + "epoch": 0.12341314478547324, + "grad_norm": 0.03486626222729683, + "learning_rate": 2e-05, + "loss": 0.495, + "step": 96 + }, + { + "epoch": 0.12469869837698859, + "grad_norm": 0.03005075454711914, + "learning_rate": 2e-05, + "loss": 0.3618, + "step": 97 + }, + { + "epoch": 0.12598425196850394, + "grad_norm": 0.03018985688686371, + "learning_rate": 2e-05, + "loss": 0.4078, + "step": 98 + }, + { + "epoch": 0.1272698055600193, + "grad_norm": 0.03108677826821804, + "learning_rate": 2e-05, + "loss": 0.4583, + "step": 99 + }, + { + "epoch": 0.12855535915153463, + "grad_norm": 0.029582438990473747, + "learning_rate": 2e-05, + "loss": 0.4142, + "step": 100 + }, + { + "epoch": 0.12984091274304999, + "grad_norm": 0.02979620173573494, + "learning_rate": 2e-05, + "loss": 0.4535, + "step": 101 + }, + { + "epoch": 0.13112646633456532, + "grad_norm": 0.032250065356492996, + "learning_rate": 2e-05, + "loss": 0.3805, + "step": 102 + }, + { + "epoch": 0.13241201992608068, + "grad_norm": 0.03306899964809418, + "learning_rate": 2e-05, + "loss": 0.4351, + "step": 103 + }, + { + "epoch": 0.133697573517596, + "grad_norm": 0.023130670189857483, + "learning_rate": 2e-05, + "loss": 0.2417, + "step": 104 + }, + { + "epoch": 0.13498312710911137, + "grad_norm": 0.03372225537896156, + "learning_rate": 2e-05, + "loss": 0.4703, + "step": 105 + }, + { + "epoch": 0.1362686807006267, + "grad_norm": 0.02907857671380043, + "learning_rate": 2e-05, + "loss": 0.3437, + "step": 106 + }, + { + "epoch": 0.13755423429214206, + "grad_norm": 0.03021407686173916, + "learning_rate": 2e-05, + "loss": 0.4327, + "step": 107 + }, + { + "epoch": 0.1388397878836574, + "grad_norm": 0.027038615196943283, + "learning_rate": 2e-05, + "loss": 0.3652, + "step": 108 + }, + { + "epoch": 0.14012534147517275, + "grad_norm": 0.02982942759990692, + "learning_rate": 2e-05, + "loss": 0.345, + "step": 109 + }, + { + "epoch": 0.14141089506668808, + "grad_norm": 0.0561259388923645, + "learning_rate": 2e-05, + "loss": 0.5073, + "step": 110 + }, + { + "epoch": 0.14269644865820344, + "grad_norm": 0.024736687541007996, + "learning_rate": 2e-05, + "loss": 0.3149, + "step": 111 + }, + { + "epoch": 0.1439820022497188, + "grad_norm": 0.02275976352393627, + "learning_rate": 2e-05, + "loss": 0.3147, + "step": 112 + }, + { + "epoch": 0.14526755584123413, + "grad_norm": 0.030464742332696915, + "learning_rate": 2e-05, + "loss": 0.4512, + "step": 113 + }, + { + "epoch": 0.1465531094327495, + "grad_norm": 0.026887530460953712, + "learning_rate": 2e-05, + "loss": 0.3679, + "step": 114 + }, + { + "epoch": 0.14783866302426482, + "grad_norm": 0.03605503961443901, + "learning_rate": 2e-05, + "loss": 0.4392, + "step": 115 + }, + { + "epoch": 0.14912421661578018, + "grad_norm": 0.02638978883624077, + "learning_rate": 2e-05, + "loss": 0.3484, + "step": 116 + }, + { + "epoch": 0.1504097702072955, + "grad_norm": 0.03650350496172905, + "learning_rate": 2e-05, + "loss": 0.3978, + "step": 117 + }, + { + "epoch": 0.15169532379881087, + "grad_norm": 0.022277837619185448, + "learning_rate": 2e-05, + "loss": 0.2525, + "step": 118 + }, + { + "epoch": 0.1529808773903262, + "grad_norm": 0.021412434056401253, + "learning_rate": 2e-05, + "loss": 0.2922, + "step": 119 + }, + { + "epoch": 0.15426643098184156, + "grad_norm": 0.029154105111956596, + "learning_rate": 2e-05, + "loss": 0.3864, + "step": 120 + }, + { + "epoch": 0.1555519845733569, + "grad_norm": 0.024072440341114998, + "learning_rate": 2e-05, + "loss": 0.2467, + "step": 121 + }, + { + "epoch": 0.15683753816487225, + "grad_norm": 0.019447140395641327, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 122 + }, + { + "epoch": 0.15812309175638758, + "grad_norm": 0.035536400973796844, + "learning_rate": 2e-05, + "loss": 0.489, + "step": 123 + }, + { + "epoch": 0.15940864534790294, + "grad_norm": 0.026226134970784187, + "learning_rate": 2e-05, + "loss": 0.3502, + "step": 124 + }, + { + "epoch": 0.16069419893941828, + "grad_norm": 0.029284900054335594, + "learning_rate": 2e-05, + "loss": 0.3683, + "step": 125 + }, + { + "epoch": 0.16197975253093364, + "grad_norm": 0.026484966278076172, + "learning_rate": 2e-05, + "loss": 0.3686, + "step": 126 + }, + { + "epoch": 0.16326530612244897, + "grad_norm": 0.03296555206179619, + "learning_rate": 2e-05, + "loss": 0.4598, + "step": 127 + }, + { + "epoch": 0.16455085971396433, + "grad_norm": 0.03217398375272751, + "learning_rate": 2e-05, + "loss": 0.4292, + "step": 128 + }, + { + "epoch": 0.16583641330547969, + "grad_norm": 0.02639828808605671, + "learning_rate": 2e-05, + "loss": 0.324, + "step": 129 + }, + { + "epoch": 0.16712196689699502, + "grad_norm": 0.025398138910531998, + "learning_rate": 2e-05, + "loss": 0.3565, + "step": 130 + }, + { + "epoch": 0.16840752048851038, + "grad_norm": 0.026609797030687332, + "learning_rate": 2e-05, + "loss": 0.326, + "step": 131 + }, + { + "epoch": 0.1696930740800257, + "grad_norm": 0.029938040301203728, + "learning_rate": 2e-05, + "loss": 0.4149, + "step": 132 + }, + { + "epoch": 0.17097862767154107, + "grad_norm": 0.02608969807624817, + "learning_rate": 2e-05, + "loss": 0.3774, + "step": 133 + }, + { + "epoch": 0.1722641812630564, + "grad_norm": 0.02580363303422928, + "learning_rate": 2e-05, + "loss": 0.2944, + "step": 134 + }, + { + "epoch": 0.17354973485457176, + "grad_norm": 0.029851458966732025, + "learning_rate": 2e-05, + "loss": 0.3316, + "step": 135 + }, + { + "epoch": 0.1748352884460871, + "grad_norm": 0.02928406558930874, + "learning_rate": 2e-05, + "loss": 0.3548, + "step": 136 + }, + { + "epoch": 0.17612084203760245, + "grad_norm": 0.030875032767653465, + "learning_rate": 2e-05, + "loss": 0.3617, + "step": 137 + }, + { + "epoch": 0.17740639562911778, + "grad_norm": 0.026721350848674774, + "learning_rate": 2e-05, + "loss": 0.3799, + "step": 138 + }, + { + "epoch": 0.17869194922063314, + "grad_norm": 0.03269115090370178, + "learning_rate": 2e-05, + "loss": 0.4324, + "step": 139 + }, + { + "epoch": 0.17997750281214847, + "grad_norm": 0.022154508158564568, + "learning_rate": 2e-05, + "loss": 0.2744, + "step": 140 + }, + { + "epoch": 0.18126305640366383, + "grad_norm": 0.022251179441809654, + "learning_rate": 2e-05, + "loss": 0.2886, + "step": 141 + }, + { + "epoch": 0.18254860999517916, + "grad_norm": 0.03386593237519264, + "learning_rate": 2e-05, + "loss": 0.473, + "step": 142 + }, + { + "epoch": 0.18383416358669452, + "grad_norm": 0.02578306384384632, + "learning_rate": 2e-05, + "loss": 0.3224, + "step": 143 + }, + { + "epoch": 0.18511971717820988, + "grad_norm": 0.027509864419698715, + "learning_rate": 2e-05, + "loss": 0.3224, + "step": 144 + }, + { + "epoch": 0.1864052707697252, + "grad_norm": 0.02819378860294819, + "learning_rate": 2e-05, + "loss": 0.3176, + "step": 145 + }, + { + "epoch": 0.18769082436124057, + "grad_norm": 0.028061147779226303, + "learning_rate": 2e-05, + "loss": 0.3494, + "step": 146 + }, + { + "epoch": 0.1889763779527559, + "grad_norm": 0.032399386167526245, + "learning_rate": 2e-05, + "loss": 0.3647, + "step": 147 + }, + { + "epoch": 0.19026193154427126, + "grad_norm": 0.028246790170669556, + "learning_rate": 2e-05, + "loss": 0.3366, + "step": 148 + }, + { + "epoch": 0.1915474851357866, + "grad_norm": 0.03099609911441803, + "learning_rate": 2e-05, + "loss": 0.4034, + "step": 149 + }, + { + "epoch": 0.19283303872730195, + "grad_norm": 0.03750993683934212, + "learning_rate": 2e-05, + "loss": 0.3395, + "step": 150 + }, + { + "epoch": 0.19411859231881728, + "grad_norm": 0.0326780304312706, + "learning_rate": 2e-05, + "loss": 0.4482, + "step": 151 + }, + { + "epoch": 0.19540414591033264, + "grad_norm": 0.033816393464803696, + "learning_rate": 2e-05, + "loss": 0.4504, + "step": 152 + }, + { + "epoch": 0.19668969950184798, + "grad_norm": 0.026754887774586678, + "learning_rate": 2e-05, + "loss": 0.3334, + "step": 153 + }, + { + "epoch": 0.19797525309336333, + "grad_norm": 0.02957574650645256, + "learning_rate": 2e-05, + "loss": 0.3698, + "step": 154 + }, + { + "epoch": 0.19926080668487867, + "grad_norm": 0.02848845347762108, + "learning_rate": 2e-05, + "loss": 0.3001, + "step": 155 + }, + { + "epoch": 0.20054636027639403, + "grad_norm": 0.03636415675282478, + "learning_rate": 2e-05, + "loss": 0.4872, + "step": 156 + }, + { + "epoch": 0.20183191386790936, + "grad_norm": 0.018864037469029427, + "learning_rate": 2e-05, + "loss": 0.2086, + "step": 157 + }, + { + "epoch": 0.20311746745942472, + "grad_norm": 0.027126725763082504, + "learning_rate": 2e-05, + "loss": 0.3181, + "step": 158 + }, + { + "epoch": 0.20440302105094005, + "grad_norm": 0.025296056643128395, + "learning_rate": 2e-05, + "loss": 0.3169, + "step": 159 + }, + { + "epoch": 0.2056885746424554, + "grad_norm": 0.035376112908124924, + "learning_rate": 2e-05, + "loss": 0.4219, + "step": 160 + }, + { + "epoch": 0.20697412823397077, + "grad_norm": 0.030744420364499092, + "learning_rate": 2e-05, + "loss": 0.3818, + "step": 161 + }, + { + "epoch": 0.2082596818254861, + "grad_norm": 0.03273791819810867, + "learning_rate": 2e-05, + "loss": 0.3823, + "step": 162 + }, + { + "epoch": 0.20954523541700146, + "grad_norm": 0.030423806980252266, + "learning_rate": 2e-05, + "loss": 0.3451, + "step": 163 + }, + { + "epoch": 0.2108307890085168, + "grad_norm": 0.029618561267852783, + "learning_rate": 2e-05, + "loss": 0.3604, + "step": 164 + }, + { + "epoch": 0.21211634260003215, + "grad_norm": 0.030883729457855225, + "learning_rate": 2e-05, + "loss": 0.401, + "step": 165 + }, + { + "epoch": 0.21340189619154748, + "grad_norm": 0.028922105208039284, + "learning_rate": 2e-05, + "loss": 0.3305, + "step": 166 + }, + { + "epoch": 0.21468744978306284, + "grad_norm": 0.033665966242551804, + "learning_rate": 2e-05, + "loss": 0.3175, + "step": 167 + }, + { + "epoch": 0.21597300337457817, + "grad_norm": 0.035460278391838074, + "learning_rate": 2e-05, + "loss": 0.4509, + "step": 168 + }, + { + "epoch": 0.21725855696609353, + "grad_norm": 0.026533829048275948, + "learning_rate": 2e-05, + "loss": 0.2403, + "step": 169 + }, + { + "epoch": 0.21854411055760886, + "grad_norm": 0.029200293123722076, + "learning_rate": 2e-05, + "loss": 0.2794, + "step": 170 + }, + { + "epoch": 0.21982966414912422, + "grad_norm": 0.027879290282726288, + "learning_rate": 2e-05, + "loss": 0.2995, + "step": 171 + }, + { + "epoch": 0.22111521774063955, + "grad_norm": 0.027549387887120247, + "learning_rate": 2e-05, + "loss": 0.2803, + "step": 172 + }, + { + "epoch": 0.2224007713321549, + "grad_norm": 0.03113819658756256, + "learning_rate": 2e-05, + "loss": 0.2479, + "step": 173 + }, + { + "epoch": 0.22368632492367024, + "grad_norm": 0.024273231625556946, + "learning_rate": 2e-05, + "loss": 0.2705, + "step": 174 + }, + { + "epoch": 0.2249718785151856, + "grad_norm": 0.02970244735479355, + "learning_rate": 2e-05, + "loss": 0.3266, + "step": 175 + }, + { + "epoch": 0.22625743210670096, + "grad_norm": 0.028792355209589005, + "learning_rate": 2e-05, + "loss": 0.3311, + "step": 176 + }, + { + "epoch": 0.2275429856982163, + "grad_norm": 0.029121607542037964, + "learning_rate": 2e-05, + "loss": 0.2814, + "step": 177 + }, + { + "epoch": 0.22882853928973165, + "grad_norm": 0.029099591076374054, + "learning_rate": 2e-05, + "loss": 0.3065, + "step": 178 + }, + { + "epoch": 0.23011409288124698, + "grad_norm": 0.02833685837686062, + "learning_rate": 2e-05, + "loss": 0.3459, + "step": 179 + }, + { + "epoch": 0.23139964647276234, + "grad_norm": 0.03676662966609001, + "learning_rate": 2e-05, + "loss": 0.4179, + "step": 180 + }, + { + "epoch": 0.23268520006427768, + "grad_norm": 0.02846740558743477, + "learning_rate": 2e-05, + "loss": 0.2879, + "step": 181 + }, + { + "epoch": 0.23397075365579303, + "grad_norm": 0.030531803146004677, + "learning_rate": 2e-05, + "loss": 0.362, + "step": 182 + }, + { + "epoch": 0.23525630724730837, + "grad_norm": 0.034853462129831314, + "learning_rate": 2e-05, + "loss": 0.3814, + "step": 183 + }, + { + "epoch": 0.23654186083882373, + "grad_norm": 0.03336189687252045, + "learning_rate": 2e-05, + "loss": 0.4272, + "step": 184 + }, + { + "epoch": 0.23782741443033906, + "grad_norm": 0.03514046594500542, + "learning_rate": 2e-05, + "loss": 0.3432, + "step": 185 + }, + { + "epoch": 0.23911296802185442, + "grad_norm": 0.032468028366565704, + "learning_rate": 2e-05, + "loss": 0.405, + "step": 186 + }, + { + "epoch": 0.24039852161336975, + "grad_norm": 0.026813151314854622, + "learning_rate": 2e-05, + "loss": 0.2746, + "step": 187 + }, + { + "epoch": 0.2416840752048851, + "grad_norm": 0.03329463675618172, + "learning_rate": 2e-05, + "loss": 0.3566, + "step": 188 + }, + { + "epoch": 0.24296962879640044, + "grad_norm": 0.03253549337387085, + "learning_rate": 2e-05, + "loss": 0.3951, + "step": 189 + }, + { + "epoch": 0.2442551823879158, + "grad_norm": 0.03337908163666725, + "learning_rate": 2e-05, + "loss": 0.4022, + "step": 190 + }, + { + "epoch": 0.24554073597943113, + "grad_norm": 0.029503121972084045, + "learning_rate": 2e-05, + "loss": 0.3154, + "step": 191 + }, + { + "epoch": 0.2468262895709465, + "grad_norm": 0.03800208494067192, + "learning_rate": 2e-05, + "loss": 0.3716, + "step": 192 + }, + { + "epoch": 0.24811184316246185, + "grad_norm": 0.04471494257450104, + "learning_rate": 2e-05, + "loss": 0.3618, + "step": 193 + }, + { + "epoch": 0.24939739675397718, + "grad_norm": 0.03158828616142273, + "learning_rate": 2e-05, + "loss": 0.3035, + "step": 194 + }, + { + "epoch": 0.25068295034549254, + "grad_norm": 0.030343275517225266, + "learning_rate": 2e-05, + "loss": 0.3177, + "step": 195 + }, + { + "epoch": 0.25196850393700787, + "grad_norm": 0.027333417907357216, + "learning_rate": 2e-05, + "loss": 0.2688, + "step": 196 + }, + { + "epoch": 0.2532540575285232, + "grad_norm": 0.034231096506118774, + "learning_rate": 2e-05, + "loss": 0.2827, + "step": 197 + }, + { + "epoch": 0.2545396111200386, + "grad_norm": 0.042767249047756195, + "learning_rate": 2e-05, + "loss": 0.3617, + "step": 198 + }, + { + "epoch": 0.2558251647115539, + "grad_norm": 0.04363776370882988, + "learning_rate": 2e-05, + "loss": 0.428, + "step": 199 + }, + { + "epoch": 0.25711071830306925, + "grad_norm": 0.03701059892773628, + "learning_rate": 2e-05, + "loss": 0.4258, + "step": 200 + }, + { + "epoch": 0.2583962718945846, + "grad_norm": 0.03248538821935654, + "learning_rate": 2e-05, + "loss": 0.3127, + "step": 201 + }, + { + "epoch": 0.25968182548609997, + "grad_norm": 0.02792442962527275, + "learning_rate": 2e-05, + "loss": 0.2616, + "step": 202 + }, + { + "epoch": 0.2609673790776153, + "grad_norm": 0.02882961928844452, + "learning_rate": 2e-05, + "loss": 0.2822, + "step": 203 + }, + { + "epoch": 0.26225293266913063, + "grad_norm": 0.02498476952314377, + "learning_rate": 2e-05, + "loss": 0.2291, + "step": 204 + }, + { + "epoch": 0.26353848626064597, + "grad_norm": 0.0262466911226511, + "learning_rate": 2e-05, + "loss": 0.2084, + "step": 205 + }, + { + "epoch": 0.26482403985216135, + "grad_norm": 0.031161930412054062, + "learning_rate": 2e-05, + "loss": 0.2977, + "step": 206 + }, + { + "epoch": 0.2661095934436767, + "grad_norm": 0.03852604702115059, + "learning_rate": 2e-05, + "loss": 0.3606, + "step": 207 + }, + { + "epoch": 0.267395147035192, + "grad_norm": 0.03641024976968765, + "learning_rate": 2e-05, + "loss": 0.3855, + "step": 208 + }, + { + "epoch": 0.2686807006267074, + "grad_norm": 0.03774799406528473, + "learning_rate": 2e-05, + "loss": 0.3458, + "step": 209 + }, + { + "epoch": 0.26996625421822273, + "grad_norm": 0.04067372530698776, + "learning_rate": 2e-05, + "loss": 0.4515, + "step": 210 + }, + { + "epoch": 0.27125180780973807, + "grad_norm": 0.03964482620358467, + "learning_rate": 2e-05, + "loss": 0.4272, + "step": 211 + }, + { + "epoch": 0.2725373614012534, + "grad_norm": 0.02894040197134018, + "learning_rate": 2e-05, + "loss": 0.256, + "step": 212 + }, + { + "epoch": 0.2738229149927688, + "grad_norm": 0.036077771335840225, + "learning_rate": 2e-05, + "loss": 0.3755, + "step": 213 + }, + { + "epoch": 0.2751084685842841, + "grad_norm": 0.032988108694553375, + "learning_rate": 2e-05, + "loss": 0.3135, + "step": 214 + }, + { + "epoch": 0.27639402217579945, + "grad_norm": 0.02877802960574627, + "learning_rate": 2e-05, + "loss": 0.2762, + "step": 215 + }, + { + "epoch": 0.2776795757673148, + "grad_norm": 0.03700711205601692, + "learning_rate": 2e-05, + "loss": 0.3022, + "step": 216 + }, + { + "epoch": 0.27896512935883017, + "grad_norm": 0.03660174459218979, + "learning_rate": 2e-05, + "loss": 0.3265, + "step": 217 + }, + { + "epoch": 0.2802506829503455, + "grad_norm": 0.034895338118076324, + "learning_rate": 2e-05, + "loss": 0.3337, + "step": 218 + }, + { + "epoch": 0.28153623654186083, + "grad_norm": 0.029524167999625206, + "learning_rate": 2e-05, + "loss": 0.2872, + "step": 219 + }, + { + "epoch": 0.28282179013337616, + "grad_norm": 0.037102892994880676, + "learning_rate": 2e-05, + "loss": 0.3484, + "step": 220 + }, + { + "epoch": 0.28410734372489155, + "grad_norm": 0.02568918839097023, + "learning_rate": 2e-05, + "loss": 0.2352, + "step": 221 + }, + { + "epoch": 0.2853928973164069, + "grad_norm": 0.03680694103240967, + "learning_rate": 2e-05, + "loss": 0.3156, + "step": 222 + }, + { + "epoch": 0.2866784509079222, + "grad_norm": 0.03616785258054733, + "learning_rate": 2e-05, + "loss": 0.3435, + "step": 223 + }, + { + "epoch": 0.2879640044994376, + "grad_norm": 0.03019794449210167, + "learning_rate": 2e-05, + "loss": 0.2342, + "step": 224 + }, + { + "epoch": 0.28924955809095293, + "grad_norm": 0.029189620167016983, + "learning_rate": 2e-05, + "loss": 0.2622, + "step": 225 + }, + { + "epoch": 0.29053511168246826, + "grad_norm": 0.03722851350903511, + "learning_rate": 2e-05, + "loss": 0.3245, + "step": 226 + }, + { + "epoch": 0.2918206652739836, + "grad_norm": 0.028928019106388092, + "learning_rate": 2e-05, + "loss": 0.2444, + "step": 227 + }, + { + "epoch": 0.293106218865499, + "grad_norm": 0.03965122997760773, + "learning_rate": 2e-05, + "loss": 0.2914, + "step": 228 + }, + { + "epoch": 0.2943917724570143, + "grad_norm": 0.03618443012237549, + "learning_rate": 2e-05, + "loss": 0.2944, + "step": 229 + }, + { + "epoch": 0.29567732604852964, + "grad_norm": 0.04255329445004463, + "learning_rate": 2e-05, + "loss": 0.3803, + "step": 230 + }, + { + "epoch": 0.296962879640045, + "grad_norm": 0.03631114959716797, + "learning_rate": 2e-05, + "loss": 0.3529, + "step": 231 + }, + { + "epoch": 0.29824843323156036, + "grad_norm": 0.0347764790058136, + "learning_rate": 2e-05, + "loss": 0.2967, + "step": 232 + }, + { + "epoch": 0.2995339868230757, + "grad_norm": 0.03510100021958351, + "learning_rate": 2e-05, + "loss": 0.3316, + "step": 233 + }, + { + "epoch": 0.300819540414591, + "grad_norm": 0.03378084674477577, + "learning_rate": 2e-05, + "loss": 0.3318, + "step": 234 + }, + { + "epoch": 0.30210509400610636, + "grad_norm": 0.035719968378543854, + "learning_rate": 2e-05, + "loss": 0.2408, + "step": 235 + }, + { + "epoch": 0.30339064759762174, + "grad_norm": 0.03345809876918793, + "learning_rate": 2e-05, + "loss": 0.2553, + "step": 236 + }, + { + "epoch": 0.3046762011891371, + "grad_norm": 0.03555387631058693, + "learning_rate": 2e-05, + "loss": 0.2032, + "step": 237 + }, + { + "epoch": 0.3059617547806524, + "grad_norm": 0.037534430623054504, + "learning_rate": 2e-05, + "loss": 0.3482, + "step": 238 + }, + { + "epoch": 0.30724730837216774, + "grad_norm": 0.03810921311378479, + "learning_rate": 2e-05, + "loss": 0.2931, + "step": 239 + }, + { + "epoch": 0.3085328619636831, + "grad_norm": 0.03767091780900955, + "learning_rate": 2e-05, + "loss": 0.3031, + "step": 240 + }, + { + "epoch": 0.30981841555519846, + "grad_norm": 0.04636585712432861, + "learning_rate": 2e-05, + "loss": 0.3323, + "step": 241 + }, + { + "epoch": 0.3111039691467138, + "grad_norm": 0.02405642159283161, + "learning_rate": 2e-05, + "loss": 0.1964, + "step": 242 + }, + { + "epoch": 0.3123895227382292, + "grad_norm": 0.03820343688130379, + "learning_rate": 2e-05, + "loss": 0.265, + "step": 243 + }, + { + "epoch": 0.3136750763297445, + "grad_norm": 0.04235352948307991, + "learning_rate": 2e-05, + "loss": 0.3761, + "step": 244 + }, + { + "epoch": 0.31496062992125984, + "grad_norm": 0.02953983098268509, + "learning_rate": 2e-05, + "loss": 0.2458, + "step": 245 + }, + { + "epoch": 0.31624618351277517, + "grad_norm": 0.031593743711709976, + "learning_rate": 2e-05, + "loss": 0.2278, + "step": 246 + }, + { + "epoch": 0.31753173710429056, + "grad_norm": 0.033025920391082764, + "learning_rate": 2e-05, + "loss": 0.2967, + "step": 247 + }, + { + "epoch": 0.3188172906958059, + "grad_norm": 0.03608924522995949, + "learning_rate": 2e-05, + "loss": 0.3211, + "step": 248 + }, + { + "epoch": 0.3201028442873212, + "grad_norm": 0.029520737007260323, + "learning_rate": 2e-05, + "loss": 0.2659, + "step": 249 + }, + { + "epoch": 0.32138839787883655, + "grad_norm": 0.043838564306497574, + "learning_rate": 2e-05, + "loss": 0.4001, + "step": 250 + }, + { + "epoch": 0.32267395147035194, + "grad_norm": 0.03314085677266121, + "learning_rate": 2e-05, + "loss": 0.2669, + "step": 251 + }, + { + "epoch": 0.32395950506186727, + "grad_norm": 0.03647439181804657, + "learning_rate": 2e-05, + "loss": 0.3063, + "step": 252 + }, + { + "epoch": 0.3252450586533826, + "grad_norm": 0.03778000921010971, + "learning_rate": 2e-05, + "loss": 0.3437, + "step": 253 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 0.035549599677324295, + "learning_rate": 2e-05, + "loss": 0.3332, + "step": 254 + }, + { + "epoch": 0.3278161658364133, + "grad_norm": 0.033758629113435745, + "learning_rate": 2e-05, + "loss": 0.2372, + "step": 255 + }, + { + "epoch": 0.32910171942792865, + "grad_norm": 0.04042687267065048, + "learning_rate": 2e-05, + "loss": 0.2862, + "step": 256 + }, + { + "epoch": 0.330387273019444, + "grad_norm": 0.032794684171676636, + "learning_rate": 2e-05, + "loss": 0.2631, + "step": 257 + }, + { + "epoch": 0.33167282661095937, + "grad_norm": 0.03374920412898064, + "learning_rate": 2e-05, + "loss": 0.2703, + "step": 258 + }, + { + "epoch": 0.3329583802024747, + "grad_norm": 0.03981158137321472, + "learning_rate": 2e-05, + "loss": 0.3021, + "step": 259 + }, + { + "epoch": 0.33424393379399003, + "grad_norm": 0.034164056181907654, + "learning_rate": 2e-05, + "loss": 0.3245, + "step": 260 + }, + { + "epoch": 0.33552948738550537, + "grad_norm": 0.03673673793673515, + "learning_rate": 2e-05, + "loss": 0.3044, + "step": 261 + }, + { + "epoch": 0.33681504097702075, + "grad_norm": 0.04251427203416824, + "learning_rate": 2e-05, + "loss": 0.3132, + "step": 262 + }, + { + "epoch": 0.3381005945685361, + "grad_norm": 0.055292125791311264, + "learning_rate": 2e-05, + "loss": 0.3668, + "step": 263 + }, + { + "epoch": 0.3393861481600514, + "grad_norm": 0.03982202708721161, + "learning_rate": 2e-05, + "loss": 0.3389, + "step": 264 + }, + { + "epoch": 0.34067170175156675, + "grad_norm": 0.03548764809966087, + "learning_rate": 2e-05, + "loss": 0.2439, + "step": 265 + }, + { + "epoch": 0.34195725534308213, + "grad_norm": 0.04806696996092796, + "learning_rate": 2e-05, + "loss": 0.3923, + "step": 266 + }, + { + "epoch": 0.34324280893459747, + "grad_norm": 0.036050595343112946, + "learning_rate": 2e-05, + "loss": 0.2605, + "step": 267 + }, + { + "epoch": 0.3445283625261128, + "grad_norm": 0.032735515385866165, + "learning_rate": 2e-05, + "loss": 0.2451, + "step": 268 + }, + { + "epoch": 0.34581391611762813, + "grad_norm": 0.039695464074611664, + "learning_rate": 2e-05, + "loss": 0.3072, + "step": 269 + }, + { + "epoch": 0.3470994697091435, + "grad_norm": 0.027333933860063553, + "learning_rate": 2e-05, + "loss": 0.2099, + "step": 270 + }, + { + "epoch": 0.34838502330065885, + "grad_norm": 0.03149592876434326, + "learning_rate": 2e-05, + "loss": 0.2613, + "step": 271 + }, + { + "epoch": 0.3496705768921742, + "grad_norm": 0.031215226277709007, + "learning_rate": 2e-05, + "loss": 0.2833, + "step": 272 + }, + { + "epoch": 0.35095613048368957, + "grad_norm": 0.04059711471199989, + "learning_rate": 2e-05, + "loss": 0.3666, + "step": 273 + }, + { + "epoch": 0.3522416840752049, + "grad_norm": 0.04247285798192024, + "learning_rate": 2e-05, + "loss": 0.3758, + "step": 274 + }, + { + "epoch": 0.35352723766672023, + "grad_norm": 0.034378454089164734, + "learning_rate": 2e-05, + "loss": 0.2519, + "step": 275 + }, + { + "epoch": 0.35481279125823556, + "grad_norm": 0.037096619606018066, + "learning_rate": 2e-05, + "loss": 0.3256, + "step": 276 + }, + { + "epoch": 0.35609834484975095, + "grad_norm": 0.03536511957645416, + "learning_rate": 2e-05, + "loss": 0.2596, + "step": 277 + }, + { + "epoch": 0.3573838984412663, + "grad_norm": 0.046086303889751434, + "learning_rate": 2e-05, + "loss": 0.3132, + "step": 278 + }, + { + "epoch": 0.3586694520327816, + "grad_norm": 0.03302552178502083, + "learning_rate": 2e-05, + "loss": 0.2839, + "step": 279 + }, + { + "epoch": 0.35995500562429694, + "grad_norm": 0.03423115238547325, + "learning_rate": 2e-05, + "loss": 0.2678, + "step": 280 + }, + { + "epoch": 0.36124055921581233, + "grad_norm": 0.03363805264234543, + "learning_rate": 2e-05, + "loss": 0.2243, + "step": 281 + }, + { + "epoch": 0.36252611280732766, + "grad_norm": 0.03901209309697151, + "learning_rate": 2e-05, + "loss": 0.2956, + "step": 282 + }, + { + "epoch": 0.363811666398843, + "grad_norm": 0.03081115335226059, + "learning_rate": 2e-05, + "loss": 0.213, + "step": 283 + }, + { + "epoch": 0.3650972199903583, + "grad_norm": 0.04130322486162186, + "learning_rate": 2e-05, + "loss": 0.3161, + "step": 284 + }, + { + "epoch": 0.3663827735818737, + "grad_norm": 0.03694218024611473, + "learning_rate": 2e-05, + "loss": 0.3228, + "step": 285 + }, + { + "epoch": 0.36766832717338904, + "grad_norm": 0.048961639404296875, + "learning_rate": 2e-05, + "loss": 0.3688, + "step": 286 + }, + { + "epoch": 0.3689538807649044, + "grad_norm": 0.03482965752482414, + "learning_rate": 2e-05, + "loss": 0.2797, + "step": 287 + }, + { + "epoch": 0.37023943435641976, + "grad_norm": 0.043517641723155975, + "learning_rate": 2e-05, + "loss": 0.3395, + "step": 288 + }, + { + "epoch": 0.3715249879479351, + "grad_norm": 0.03916122019290924, + "learning_rate": 2e-05, + "loss": 0.3168, + "step": 289 + }, + { + "epoch": 0.3728105415394504, + "grad_norm": 0.03970535099506378, + "learning_rate": 2e-05, + "loss": 0.3523, + "step": 290 + }, + { + "epoch": 0.37409609513096576, + "grad_norm": 0.043576546013355255, + "learning_rate": 2e-05, + "loss": 0.3974, + "step": 291 + }, + { + "epoch": 0.37538164872248114, + "grad_norm": 0.03478504344820976, + "learning_rate": 2e-05, + "loss": 0.2663, + "step": 292 + }, + { + "epoch": 0.3766672023139965, + "grad_norm": 0.0442640446126461, + "learning_rate": 2e-05, + "loss": 0.2685, + "step": 293 + }, + { + "epoch": 0.3779527559055118, + "grad_norm": 0.04135148599743843, + "learning_rate": 2e-05, + "loss": 0.3765, + "step": 294 + }, + { + "epoch": 0.37923830949702714, + "grad_norm": 0.03744332864880562, + "learning_rate": 2e-05, + "loss": 0.3693, + "step": 295 + }, + { + "epoch": 0.3805238630885425, + "grad_norm": 0.038954440504312515, + "learning_rate": 2e-05, + "loss": 0.289, + "step": 296 + }, + { + "epoch": 0.38180941668005786, + "grad_norm": 0.031730618327856064, + "learning_rate": 2e-05, + "loss": 0.2271, + "step": 297 + }, + { + "epoch": 0.3830949702715732, + "grad_norm": 0.04433518648147583, + "learning_rate": 2e-05, + "loss": 0.4079, + "step": 298 + }, + { + "epoch": 0.3843805238630885, + "grad_norm": 0.04384070262312889, + "learning_rate": 2e-05, + "loss": 0.3005, + "step": 299 + }, + { + "epoch": 0.3856660774546039, + "grad_norm": 0.03004288114607334, + "learning_rate": 2e-05, + "loss": 0.2113, + "step": 300 + }, + { + "epoch": 0.38695163104611924, + "grad_norm": 0.0353570394217968, + "learning_rate": 2e-05, + "loss": 0.2198, + "step": 301 + }, + { + "epoch": 0.38823718463763457, + "grad_norm": 0.04267432913184166, + "learning_rate": 2e-05, + "loss": 0.3431, + "step": 302 + }, + { + "epoch": 0.3895227382291499, + "grad_norm": 0.04084617272019386, + "learning_rate": 2e-05, + "loss": 0.3099, + "step": 303 + }, + { + "epoch": 0.3908082918206653, + "grad_norm": 0.059954188764095306, + "learning_rate": 2e-05, + "loss": 0.3891, + "step": 304 + }, + { + "epoch": 0.3920938454121806, + "grad_norm": 0.03467090055346489, + "learning_rate": 2e-05, + "loss": 0.2383, + "step": 305 + }, + { + "epoch": 0.39337939900369595, + "grad_norm": 0.03164566680788994, + "learning_rate": 2e-05, + "loss": 0.2079, + "step": 306 + }, + { + "epoch": 0.39466495259521134, + "grad_norm": 0.048123132437467575, + "learning_rate": 2e-05, + "loss": 0.3726, + "step": 307 + }, + { + "epoch": 0.39595050618672667, + "grad_norm": 0.03534458950161934, + "learning_rate": 2e-05, + "loss": 0.2651, + "step": 308 + }, + { + "epoch": 0.397236059778242, + "grad_norm": 0.03836483508348465, + "learning_rate": 2e-05, + "loss": 0.3101, + "step": 309 + }, + { + "epoch": 0.39852161336975733, + "grad_norm": 0.047910891473293304, + "learning_rate": 2e-05, + "loss": 0.3234, + "step": 310 + }, + { + "epoch": 0.3998071669612727, + "grad_norm": 0.027741173282265663, + "learning_rate": 2e-05, + "loss": 0.1632, + "step": 311 + }, + { + "epoch": 0.40109272055278805, + "grad_norm": 0.0344574935734272, + "learning_rate": 2e-05, + "loss": 0.2463, + "step": 312 + }, + { + "epoch": 0.4023782741443034, + "grad_norm": 0.032118018716573715, + "learning_rate": 2e-05, + "loss": 0.2298, + "step": 313 + }, + { + "epoch": 0.4036638277358187, + "grad_norm": 0.040490612387657166, + "learning_rate": 2e-05, + "loss": 0.3247, + "step": 314 + }, + { + "epoch": 0.4049493813273341, + "grad_norm": 0.03369493409991264, + "learning_rate": 2e-05, + "loss": 0.2088, + "step": 315 + }, + { + "epoch": 0.40623493491884943, + "grad_norm": 0.04419386386871338, + "learning_rate": 2e-05, + "loss": 0.3354, + "step": 316 + }, + { + "epoch": 0.40752048851036476, + "grad_norm": 0.04048989340662956, + "learning_rate": 2e-05, + "loss": 0.2988, + "step": 317 + }, + { + "epoch": 0.4088060421018801, + "grad_norm": 0.040915414690971375, + "learning_rate": 2e-05, + "loss": 0.2315, + "step": 318 + }, + { + "epoch": 0.4100915956933955, + "grad_norm": 0.03020886704325676, + "learning_rate": 2e-05, + "loss": 0.2137, + "step": 319 + }, + { + "epoch": 0.4113771492849108, + "grad_norm": 0.0413849912583828, + "learning_rate": 2e-05, + "loss": 0.3479, + "step": 320 + }, + { + "epoch": 0.41266270287642615, + "grad_norm": 0.04639044404029846, + "learning_rate": 2e-05, + "loss": 0.3689, + "step": 321 + }, + { + "epoch": 0.41394825646794153, + "grad_norm": 0.044351786375045776, + "learning_rate": 2e-05, + "loss": 0.3488, + "step": 322 + }, + { + "epoch": 0.41523381005945686, + "grad_norm": 0.030558589845895767, + "learning_rate": 2e-05, + "loss": 0.2211, + "step": 323 + }, + { + "epoch": 0.4165193636509722, + "grad_norm": 0.03329205513000488, + "learning_rate": 2e-05, + "loss": 0.2282, + "step": 324 + }, + { + "epoch": 0.41780491724248753, + "grad_norm": 0.04240158200263977, + "learning_rate": 2e-05, + "loss": 0.2571, + "step": 325 + }, + { + "epoch": 0.4190904708340029, + "grad_norm": 0.040866266936063766, + "learning_rate": 2e-05, + "loss": 0.289, + "step": 326 + }, + { + "epoch": 0.42037602442551825, + "grad_norm": 0.04475086182355881, + "learning_rate": 2e-05, + "loss": 0.2889, + "step": 327 + }, + { + "epoch": 0.4216615780170336, + "grad_norm": 0.03587472438812256, + "learning_rate": 2e-05, + "loss": 0.2452, + "step": 328 + }, + { + "epoch": 0.4229471316085489, + "grad_norm": 0.04346352815628052, + "learning_rate": 2e-05, + "loss": 0.3751, + "step": 329 + }, + { + "epoch": 0.4242326852000643, + "grad_norm": 0.03417763113975525, + "learning_rate": 2e-05, + "loss": 0.2825, + "step": 330 + }, + { + "epoch": 0.42551823879157963, + "grad_norm": 0.030223989859223366, + "learning_rate": 2e-05, + "loss": 0.2339, + "step": 331 + }, + { + "epoch": 0.42680379238309496, + "grad_norm": 0.0342961922287941, + "learning_rate": 2e-05, + "loss": 0.2616, + "step": 332 + }, + { + "epoch": 0.4280893459746103, + "grad_norm": 0.04207473620772362, + "learning_rate": 2e-05, + "loss": 0.265, + "step": 333 + }, + { + "epoch": 0.4293748995661257, + "grad_norm": 0.03148888424038887, + "learning_rate": 2e-05, + "loss": 0.1792, + "step": 334 + }, + { + "epoch": 0.430660453157641, + "grad_norm": 0.039937492460012436, + "learning_rate": 2e-05, + "loss": 0.2502, + "step": 335 + }, + { + "epoch": 0.43194600674915634, + "grad_norm": 0.03943054750561714, + "learning_rate": 2e-05, + "loss": 0.2733, + "step": 336 + }, + { + "epoch": 0.43323156034067173, + "grad_norm": 0.03569771721959114, + "learning_rate": 2e-05, + "loss": 0.2099, + "step": 337 + }, + { + "epoch": 0.43451711393218706, + "grad_norm": 0.036599624902009964, + "learning_rate": 2e-05, + "loss": 0.2478, + "step": 338 + }, + { + "epoch": 0.4358026675237024, + "grad_norm": 0.054707758128643036, + "learning_rate": 2e-05, + "loss": 0.4257, + "step": 339 + }, + { + "epoch": 0.4370882211152177, + "grad_norm": 0.0450870580971241, + "learning_rate": 2e-05, + "loss": 0.3091, + "step": 340 + }, + { + "epoch": 0.4383737747067331, + "grad_norm": 0.03818565234541893, + "learning_rate": 2e-05, + "loss": 0.2781, + "step": 341 + }, + { + "epoch": 0.43965932829824844, + "grad_norm": 0.03722561523318291, + "learning_rate": 2e-05, + "loss": 0.2602, + "step": 342 + }, + { + "epoch": 0.4409448818897638, + "grad_norm": 0.038348764181137085, + "learning_rate": 2e-05, + "loss": 0.286, + "step": 343 + }, + { + "epoch": 0.4422304354812791, + "grad_norm": 0.02572775073349476, + "learning_rate": 2e-05, + "loss": 0.1886, + "step": 344 + }, + { + "epoch": 0.4435159890727945, + "grad_norm": 0.03972122073173523, + "learning_rate": 2e-05, + "loss": 0.2756, + "step": 345 + }, + { + "epoch": 0.4448015426643098, + "grad_norm": 0.03696167469024658, + "learning_rate": 2e-05, + "loss": 0.2526, + "step": 346 + }, + { + "epoch": 0.44608709625582516, + "grad_norm": 0.03587668761610985, + "learning_rate": 2e-05, + "loss": 0.2044, + "step": 347 + }, + { + "epoch": 0.4473726498473405, + "grad_norm": 0.03959975019097328, + "learning_rate": 2e-05, + "loss": 0.3007, + "step": 348 + }, + { + "epoch": 0.4486582034388559, + "grad_norm": 0.03879138454794884, + "learning_rate": 2e-05, + "loss": 0.2382, + "step": 349 + }, + { + "epoch": 0.4499437570303712, + "grad_norm": 0.05302846059203148, + "learning_rate": 2e-05, + "loss": 0.3375, + "step": 350 + }, + { + "epoch": 0.45122931062188654, + "grad_norm": 0.039411693811416626, + "learning_rate": 2e-05, + "loss": 0.2662, + "step": 351 + }, + { + "epoch": 0.4525148642134019, + "grad_norm": 0.03571093827486038, + "learning_rate": 2e-05, + "loss": 0.2054, + "step": 352 + }, + { + "epoch": 0.45380041780491726, + "grad_norm": 0.0486789233982563, + "learning_rate": 2e-05, + "loss": 0.3314, + "step": 353 + }, + { + "epoch": 0.4550859713964326, + "grad_norm": 0.037670183926820755, + "learning_rate": 2e-05, + "loss": 0.2484, + "step": 354 + }, + { + "epoch": 0.4563715249879479, + "grad_norm": 0.056887123733758926, + "learning_rate": 2e-05, + "loss": 0.3562, + "step": 355 + }, + { + "epoch": 0.4576570785794633, + "grad_norm": 0.04562405124306679, + "learning_rate": 2e-05, + "loss": 0.2869, + "step": 356 + }, + { + "epoch": 0.45894263217097864, + "grad_norm": 0.040491264313459396, + "learning_rate": 2e-05, + "loss": 0.3541, + "step": 357 + }, + { + "epoch": 0.46022818576249397, + "grad_norm": 0.04283326864242554, + "learning_rate": 2e-05, + "loss": 0.2674, + "step": 358 + }, + { + "epoch": 0.4615137393540093, + "grad_norm": 0.05063975229859352, + "learning_rate": 2e-05, + "loss": 0.4013, + "step": 359 + }, + { + "epoch": 0.4627992929455247, + "grad_norm": 0.037555571645498276, + "learning_rate": 2e-05, + "loss": 0.2419, + "step": 360 + }, + { + "epoch": 0.46408484653704, + "grad_norm": 0.036944594234228134, + "learning_rate": 2e-05, + "loss": 0.2426, + "step": 361 + }, + { + "epoch": 0.46537040012855535, + "grad_norm": 0.05010130628943443, + "learning_rate": 2e-05, + "loss": 0.2996, + "step": 362 + }, + { + "epoch": 0.4666559537200707, + "grad_norm": 0.0335206501185894, + "learning_rate": 2e-05, + "loss": 0.2451, + "step": 363 + }, + { + "epoch": 0.46794150731158607, + "grad_norm": 0.052481383085250854, + "learning_rate": 2e-05, + "loss": 0.3812, + "step": 364 + }, + { + "epoch": 0.4692270609031014, + "grad_norm": 0.04185234755277634, + "learning_rate": 2e-05, + "loss": 0.274, + "step": 365 + }, + { + "epoch": 0.47051261449461673, + "grad_norm": 0.03707558289170265, + "learning_rate": 2e-05, + "loss": 0.2505, + "step": 366 + }, + { + "epoch": 0.47179816808613206, + "grad_norm": 0.060728251934051514, + "learning_rate": 2e-05, + "loss": 0.3279, + "step": 367 + }, + { + "epoch": 0.47308372167764745, + "grad_norm": 0.031999371945858, + "learning_rate": 2e-05, + "loss": 0.1866, + "step": 368 + }, + { + "epoch": 0.4743692752691628, + "grad_norm": 0.044399287551641464, + "learning_rate": 2e-05, + "loss": 0.249, + "step": 369 + }, + { + "epoch": 0.4756548288606781, + "grad_norm": 0.05057983100414276, + "learning_rate": 2e-05, + "loss": 0.3612, + "step": 370 + }, + { + "epoch": 0.4769403824521935, + "grad_norm": 0.039979059249162674, + "learning_rate": 2e-05, + "loss": 0.2684, + "step": 371 + }, + { + "epoch": 0.47822593604370883, + "grad_norm": 0.03305087611079216, + "learning_rate": 2e-05, + "loss": 0.2164, + "step": 372 + }, + { + "epoch": 0.47951148963522416, + "grad_norm": 0.045574892312288284, + "learning_rate": 2e-05, + "loss": 0.3127, + "step": 373 + }, + { + "epoch": 0.4807970432267395, + "grad_norm": 0.05269627645611763, + "learning_rate": 2e-05, + "loss": 0.3315, + "step": 374 + }, + { + "epoch": 0.4820825968182549, + "grad_norm": 0.06162478029727936, + "learning_rate": 2e-05, + "loss": 0.3347, + "step": 375 + }, + { + "epoch": 0.4833681504097702, + "grad_norm": 0.04428340122103691, + "learning_rate": 2e-05, + "loss": 0.2794, + "step": 376 + }, + { + "epoch": 0.48465370400128555, + "grad_norm": 0.04249970242381096, + "learning_rate": 2e-05, + "loss": 0.2781, + "step": 377 + }, + { + "epoch": 0.4859392575928009, + "grad_norm": 0.04270468279719353, + "learning_rate": 2e-05, + "loss": 0.2878, + "step": 378 + }, + { + "epoch": 0.48722481118431626, + "grad_norm": 0.036853183060884476, + "learning_rate": 2e-05, + "loss": 0.2548, + "step": 379 + }, + { + "epoch": 0.4885103647758316, + "grad_norm": 0.03981437534093857, + "learning_rate": 2e-05, + "loss": 0.2743, + "step": 380 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.04621482267975807, + "learning_rate": 2e-05, + "loss": 0.3524, + "step": 381 + }, + { + "epoch": 0.49108147195886226, + "grad_norm": 0.04479382932186127, + "learning_rate": 2e-05, + "loss": 0.3013, + "step": 382 + }, + { + "epoch": 0.49236702555037765, + "grad_norm": 0.0524832084774971, + "learning_rate": 2e-05, + "loss": 0.4674, + "step": 383 + }, + { + "epoch": 0.493652579141893, + "grad_norm": 0.05657699331641197, + "learning_rate": 2e-05, + "loss": 0.3734, + "step": 384 + }, + { + "epoch": 0.4949381327334083, + "grad_norm": 0.05035189166665077, + "learning_rate": 2e-05, + "loss": 0.2522, + "step": 385 + }, + { + "epoch": 0.4962236863249237, + "grad_norm": 0.045344091951847076, + "learning_rate": 2e-05, + "loss": 0.3121, + "step": 386 + }, + { + "epoch": 0.49750923991643903, + "grad_norm": 0.038680486381053925, + "learning_rate": 2e-05, + "loss": 0.2999, + "step": 387 + }, + { + "epoch": 0.49879479350795436, + "grad_norm": 0.03980954363942146, + "learning_rate": 2e-05, + "loss": 0.2476, + "step": 388 + }, + { + "epoch": 0.5000803470994697, + "grad_norm": 0.04812563210725784, + "learning_rate": 2e-05, + "loss": 0.3218, + "step": 389 + }, + { + "epoch": 0.5013659006909851, + "grad_norm": 0.04132760316133499, + "learning_rate": 2e-05, + "loss": 0.2344, + "step": 390 + }, + { + "epoch": 0.5026514542825004, + "grad_norm": 0.03867589682340622, + "learning_rate": 2e-05, + "loss": 0.2172, + "step": 391 + }, + { + "epoch": 0.5039370078740157, + "grad_norm": 0.05404170975089073, + "learning_rate": 2e-05, + "loss": 0.3489, + "step": 392 + }, + { + "epoch": 0.5052225614655311, + "grad_norm": 0.05424851179122925, + "learning_rate": 2e-05, + "loss": 0.3908, + "step": 393 + }, + { + "epoch": 0.5065081150570464, + "grad_norm": 0.046993743628263474, + "learning_rate": 2e-05, + "loss": 0.3133, + "step": 394 + }, + { + "epoch": 0.5077936686485618, + "grad_norm": 0.038952894508838654, + "learning_rate": 2e-05, + "loss": 0.273, + "step": 395 + }, + { + "epoch": 0.5090792222400772, + "grad_norm": 0.039642345160245895, + "learning_rate": 2e-05, + "loss": 0.2163, + "step": 396 + }, + { + "epoch": 0.5103647758315925, + "grad_norm": 0.05045924335718155, + "learning_rate": 2e-05, + "loss": 0.3934, + "step": 397 + }, + { + "epoch": 0.5116503294231078, + "grad_norm": 0.03384791314601898, + "learning_rate": 2e-05, + "loss": 0.2427, + "step": 398 + }, + { + "epoch": 0.5129358830146231, + "grad_norm": 0.04521351680159569, + "learning_rate": 2e-05, + "loss": 0.3329, + "step": 399 + }, + { + "epoch": 0.5142214366061385, + "grad_norm": 0.044563427567481995, + "learning_rate": 2e-05, + "loss": 0.327, + "step": 400 + }, + { + "epoch": 0.5155069901976539, + "grad_norm": 0.027659917250275612, + "learning_rate": 2e-05, + "loss": 0.1984, + "step": 401 + }, + { + "epoch": 0.5167925437891692, + "grad_norm": 0.047275714576244354, + "learning_rate": 2e-05, + "loss": 0.2867, + "step": 402 + }, + { + "epoch": 0.5180780973806846, + "grad_norm": 0.04775230586528778, + "learning_rate": 2e-05, + "loss": 0.355, + "step": 403 + }, + { + "epoch": 0.5193636509721999, + "grad_norm": 0.04720161855220795, + "learning_rate": 2e-05, + "loss": 0.2423, + "step": 404 + }, + { + "epoch": 0.5206492045637152, + "grad_norm": 0.04180417209863663, + "learning_rate": 2e-05, + "loss": 0.2688, + "step": 405 + }, + { + "epoch": 0.5219347581552306, + "grad_norm": 0.05189646780490875, + "learning_rate": 2e-05, + "loss": 0.3736, + "step": 406 + }, + { + "epoch": 0.523220311746746, + "grad_norm": 0.04067251831293106, + "learning_rate": 2e-05, + "loss": 0.3039, + "step": 407 + }, + { + "epoch": 0.5245058653382613, + "grad_norm": 0.05931917205452919, + "learning_rate": 2e-05, + "loss": 0.3374, + "step": 408 + }, + { + "epoch": 0.5257914189297767, + "grad_norm": 0.04547608271241188, + "learning_rate": 2e-05, + "loss": 0.2968, + "step": 409 + }, + { + "epoch": 0.5270769725212919, + "grad_norm": 0.04650389403104782, + "learning_rate": 2e-05, + "loss": 0.2854, + "step": 410 + }, + { + "epoch": 0.5283625261128073, + "grad_norm": 0.05240015685558319, + "learning_rate": 2e-05, + "loss": 0.2837, + "step": 411 + }, + { + "epoch": 0.5296480797043227, + "grad_norm": 0.05040004476904869, + "learning_rate": 2e-05, + "loss": 0.2923, + "step": 412 + }, + { + "epoch": 0.530933633295838, + "grad_norm": 0.04871930554509163, + "learning_rate": 2e-05, + "loss": 0.2414, + "step": 413 + }, + { + "epoch": 0.5322191868873534, + "grad_norm": 0.04192574322223663, + "learning_rate": 2e-05, + "loss": 0.2764, + "step": 414 + }, + { + "epoch": 0.5335047404788688, + "grad_norm": 0.05296563729643822, + "learning_rate": 2e-05, + "loss": 0.2723, + "step": 415 + }, + { + "epoch": 0.534790294070384, + "grad_norm": 0.03959592431783676, + "learning_rate": 2e-05, + "loss": 0.2204, + "step": 416 + }, + { + "epoch": 0.5360758476618994, + "grad_norm": 0.03962741047143936, + "learning_rate": 2e-05, + "loss": 0.2518, + "step": 417 + }, + { + "epoch": 0.5373614012534148, + "grad_norm": 0.040081944316625595, + "learning_rate": 2e-05, + "loss": 0.2573, + "step": 418 + }, + { + "epoch": 0.5386469548449301, + "grad_norm": 0.04713954031467438, + "learning_rate": 2e-05, + "loss": 0.2925, + "step": 419 + }, + { + "epoch": 0.5399325084364455, + "grad_norm": 0.05657007545232773, + "learning_rate": 2e-05, + "loss": 0.4272, + "step": 420 + }, + { + "epoch": 0.5412180620279607, + "grad_norm": 0.05307560786604881, + "learning_rate": 2e-05, + "loss": 0.316, + "step": 421 + }, + { + "epoch": 0.5425036156194761, + "grad_norm": 0.04280155152082443, + "learning_rate": 2e-05, + "loss": 0.2614, + "step": 422 + }, + { + "epoch": 0.5437891692109915, + "grad_norm": 0.03501439467072487, + "learning_rate": 2e-05, + "loss": 0.2318, + "step": 423 + }, + { + "epoch": 0.5450747228025068, + "grad_norm": 0.05088590830564499, + "learning_rate": 2e-05, + "loss": 0.3533, + "step": 424 + }, + { + "epoch": 0.5463602763940222, + "grad_norm": 0.03503134846687317, + "learning_rate": 2e-05, + "loss": 0.2079, + "step": 425 + }, + { + "epoch": 0.5476458299855376, + "grad_norm": 0.043812718242406845, + "learning_rate": 2e-05, + "loss": 0.3205, + "step": 426 + }, + { + "epoch": 0.5489313835770528, + "grad_norm": 0.05358745902776718, + "learning_rate": 2e-05, + "loss": 0.3713, + "step": 427 + }, + { + "epoch": 0.5502169371685682, + "grad_norm": 0.042078517377376556, + "learning_rate": 2e-05, + "loss": 0.2371, + "step": 428 + }, + { + "epoch": 0.5515024907600835, + "grad_norm": 0.04489399120211601, + "learning_rate": 2e-05, + "loss": 0.2832, + "step": 429 + }, + { + "epoch": 0.5527880443515989, + "grad_norm": 0.04766567423939705, + "learning_rate": 2e-05, + "loss": 0.2151, + "step": 430 + }, + { + "epoch": 0.5540735979431143, + "grad_norm": 0.04447382688522339, + "learning_rate": 2e-05, + "loss": 0.2317, + "step": 431 + }, + { + "epoch": 0.5553591515346296, + "grad_norm": 0.04144001007080078, + "learning_rate": 2e-05, + "loss": 0.2667, + "step": 432 + }, + { + "epoch": 0.556644705126145, + "grad_norm": 0.04112810641527176, + "learning_rate": 2e-05, + "loss": 0.2758, + "step": 433 + }, + { + "epoch": 0.5579302587176603, + "grad_norm": 0.032402511686086655, + "learning_rate": 2e-05, + "loss": 0.2047, + "step": 434 + }, + { + "epoch": 0.5592158123091756, + "grad_norm": 0.04352883994579315, + "learning_rate": 2e-05, + "loss": 0.323, + "step": 435 + }, + { + "epoch": 0.560501365900691, + "grad_norm": 0.0496109239757061, + "learning_rate": 2e-05, + "loss": 0.2962, + "step": 436 + }, + { + "epoch": 0.5617869194922064, + "grad_norm": 0.04593720659613609, + "learning_rate": 2e-05, + "loss": 0.3705, + "step": 437 + }, + { + "epoch": 0.5630724730837217, + "grad_norm": 0.040998801589012146, + "learning_rate": 2e-05, + "loss": 0.2219, + "step": 438 + }, + { + "epoch": 0.564358026675237, + "grad_norm": 0.04891293868422508, + "learning_rate": 2e-05, + "loss": 0.2539, + "step": 439 + }, + { + "epoch": 0.5656435802667523, + "grad_norm": 0.04628092423081398, + "learning_rate": 2e-05, + "loss": 0.2521, + "step": 440 + }, + { + "epoch": 0.5669291338582677, + "grad_norm": 0.03929414600133896, + "learning_rate": 2e-05, + "loss": 0.1931, + "step": 441 + }, + { + "epoch": 0.5682146874497831, + "grad_norm": 0.03937762975692749, + "learning_rate": 2e-05, + "loss": 0.2225, + "step": 442 + }, + { + "epoch": 0.5695002410412984, + "grad_norm": 0.057498469948768616, + "learning_rate": 2e-05, + "loss": 0.4021, + "step": 443 + }, + { + "epoch": 0.5707857946328138, + "grad_norm": 0.04665215313434601, + "learning_rate": 2e-05, + "loss": 0.3026, + "step": 444 + }, + { + "epoch": 0.5720713482243291, + "grad_norm": 0.04521113634109497, + "learning_rate": 2e-05, + "loss": 0.2592, + "step": 445 + }, + { + "epoch": 0.5733569018158444, + "grad_norm": 0.038349051028490067, + "learning_rate": 2e-05, + "loss": 0.2501, + "step": 446 + }, + { + "epoch": 0.5746424554073598, + "grad_norm": 0.04515808820724487, + "learning_rate": 2e-05, + "loss": 0.3092, + "step": 447 + }, + { + "epoch": 0.5759280089988752, + "grad_norm": 0.047012921422719955, + "learning_rate": 2e-05, + "loss": 0.3338, + "step": 448 + }, + { + "epoch": 0.5772135625903905, + "grad_norm": 0.0472906231880188, + "learning_rate": 2e-05, + "loss": 0.3139, + "step": 449 + }, + { + "epoch": 0.5784991161819059, + "grad_norm": 0.04748733341693878, + "learning_rate": 2e-05, + "loss": 0.2414, + "step": 450 + }, + { + "epoch": 0.5797846697734211, + "grad_norm": 0.03514058515429497, + "learning_rate": 2e-05, + "loss": 0.1946, + "step": 451 + }, + { + "epoch": 0.5810702233649365, + "grad_norm": 0.050174906849861145, + "learning_rate": 2e-05, + "loss": 0.3284, + "step": 452 + }, + { + "epoch": 0.5823557769564519, + "grad_norm": 0.05283737555146217, + "learning_rate": 2e-05, + "loss": 0.3073, + "step": 453 + }, + { + "epoch": 0.5836413305479672, + "grad_norm": 0.04498602822422981, + "learning_rate": 2e-05, + "loss": 0.2604, + "step": 454 + }, + { + "epoch": 0.5849268841394826, + "grad_norm": 0.042758163064718246, + "learning_rate": 2e-05, + "loss": 0.2221, + "step": 455 + }, + { + "epoch": 0.586212437730998, + "grad_norm": 0.041656941175460815, + "learning_rate": 2e-05, + "loss": 0.2491, + "step": 456 + }, + { + "epoch": 0.5874979913225132, + "grad_norm": 0.03713398054242134, + "learning_rate": 2e-05, + "loss": 0.1754, + "step": 457 + }, + { + "epoch": 0.5887835449140286, + "grad_norm": 0.0447508729994297, + "learning_rate": 2e-05, + "loss": 0.2792, + "step": 458 + }, + { + "epoch": 0.5900690985055439, + "grad_norm": 0.04686212167143822, + "learning_rate": 2e-05, + "loss": 0.2609, + "step": 459 + }, + { + "epoch": 0.5913546520970593, + "grad_norm": 0.040732961148023605, + "learning_rate": 2e-05, + "loss": 0.2089, + "step": 460 + }, + { + "epoch": 0.5926402056885747, + "grad_norm": 0.04114542156457901, + "learning_rate": 2e-05, + "loss": 0.2315, + "step": 461 + }, + { + "epoch": 0.59392575928009, + "grad_norm": 0.040324702858924866, + "learning_rate": 2e-05, + "loss": 0.2778, + "step": 462 + }, + { + "epoch": 0.5952113128716053, + "grad_norm": 0.0678023248910904, + "learning_rate": 2e-05, + "loss": 0.3029, + "step": 463 + }, + { + "epoch": 0.5964968664631207, + "grad_norm": 0.04701264947652817, + "learning_rate": 2e-05, + "loss": 0.2829, + "step": 464 + }, + { + "epoch": 0.597782420054636, + "grad_norm": 0.03481682017445564, + "learning_rate": 2e-05, + "loss": 0.2345, + "step": 465 + }, + { + "epoch": 0.5990679736461514, + "grad_norm": 0.0509064756333828, + "learning_rate": 2e-05, + "loss": 0.303, + "step": 466 + }, + { + "epoch": 0.6003535272376668, + "grad_norm": 0.052839163690805435, + "learning_rate": 2e-05, + "loss": 0.2798, + "step": 467 + }, + { + "epoch": 0.601639080829182, + "grad_norm": 0.03605001047253609, + "learning_rate": 2e-05, + "loss": 0.1783, + "step": 468 + }, + { + "epoch": 0.6029246344206974, + "grad_norm": 0.03640325739979744, + "learning_rate": 2e-05, + "loss": 0.2498, + "step": 469 + }, + { + "epoch": 0.6042101880122127, + "grad_norm": 0.03874512016773224, + "learning_rate": 2e-05, + "loss": 0.1996, + "step": 470 + }, + { + "epoch": 0.6054957416037281, + "grad_norm": 0.03477559611201286, + "learning_rate": 2e-05, + "loss": 0.2121, + "step": 471 + }, + { + "epoch": 0.6067812951952435, + "grad_norm": 0.04953417927026749, + "learning_rate": 2e-05, + "loss": 0.2821, + "step": 472 + }, + { + "epoch": 0.6080668487867588, + "grad_norm": 0.04992024600505829, + "learning_rate": 2e-05, + "loss": 0.362, + "step": 473 + }, + { + "epoch": 0.6093524023782741, + "grad_norm": 0.048429060727357864, + "learning_rate": 2e-05, + "loss": 0.2217, + "step": 474 + }, + { + "epoch": 0.6106379559697895, + "grad_norm": 0.05344587191939354, + "learning_rate": 2e-05, + "loss": 0.2989, + "step": 475 + }, + { + "epoch": 0.6119235095613048, + "grad_norm": 0.04274825379252434, + "learning_rate": 2e-05, + "loss": 0.2424, + "step": 476 + }, + { + "epoch": 0.6132090631528202, + "grad_norm": 0.04651128128170967, + "learning_rate": 2e-05, + "loss": 0.3412, + "step": 477 + }, + { + "epoch": 0.6144946167443355, + "grad_norm": 0.05821945145726204, + "learning_rate": 2e-05, + "loss": 0.2726, + "step": 478 + }, + { + "epoch": 0.6157801703358509, + "grad_norm": 0.0519278421998024, + "learning_rate": 2e-05, + "loss": 0.2927, + "step": 479 + }, + { + "epoch": 0.6170657239273662, + "grad_norm": 0.03331352025270462, + "learning_rate": 2e-05, + "loss": 0.1688, + "step": 480 + }, + { + "epoch": 0.6183512775188815, + "grad_norm": 0.04451346397399902, + "learning_rate": 2e-05, + "loss": 0.213, + "step": 481 + }, + { + "epoch": 0.6196368311103969, + "grad_norm": 0.04776597023010254, + "learning_rate": 2e-05, + "loss": 0.2826, + "step": 482 + }, + { + "epoch": 0.6209223847019123, + "grad_norm": 0.0488264262676239, + "learning_rate": 2e-05, + "loss": 0.2886, + "step": 483 + }, + { + "epoch": 0.6222079382934276, + "grad_norm": 0.04393550381064415, + "learning_rate": 2e-05, + "loss": 0.1949, + "step": 484 + }, + { + "epoch": 0.623493491884943, + "grad_norm": 0.050872016698122025, + "learning_rate": 2e-05, + "loss": 0.2201, + "step": 485 + }, + { + "epoch": 0.6247790454764584, + "grad_norm": 0.06177595257759094, + "learning_rate": 2e-05, + "loss": 0.318, + "step": 486 + }, + { + "epoch": 0.6260645990679736, + "grad_norm": 0.03842415288090706, + "learning_rate": 2e-05, + "loss": 0.1763, + "step": 487 + }, + { + "epoch": 0.627350152659489, + "grad_norm": 0.04788699373602867, + "learning_rate": 2e-05, + "loss": 0.247, + "step": 488 + }, + { + "epoch": 0.6286357062510043, + "grad_norm": 0.05789102241396904, + "learning_rate": 2e-05, + "loss": 0.3338, + "step": 489 + }, + { + "epoch": 0.6299212598425197, + "grad_norm": 0.04298072308301926, + "learning_rate": 2e-05, + "loss": 0.2321, + "step": 490 + }, + { + "epoch": 0.6312068134340351, + "grad_norm": 0.03914102911949158, + "learning_rate": 2e-05, + "loss": 0.2341, + "step": 491 + }, + { + "epoch": 0.6324923670255503, + "grad_norm": 0.04699448123574257, + "learning_rate": 2e-05, + "loss": 0.2462, + "step": 492 + }, + { + "epoch": 0.6337779206170657, + "grad_norm": 0.04092938452959061, + "learning_rate": 2e-05, + "loss": 0.2378, + "step": 493 + }, + { + "epoch": 0.6350634742085811, + "grad_norm": 0.0463721826672554, + "learning_rate": 2e-05, + "loss": 0.225, + "step": 494 + }, + { + "epoch": 0.6363490278000964, + "grad_norm": 0.0489421971142292, + "learning_rate": 2e-05, + "loss": 0.2341, + "step": 495 + }, + { + "epoch": 0.6376345813916118, + "grad_norm": 0.04278067871928215, + "learning_rate": 2e-05, + "loss": 0.234, + "step": 496 + }, + { + "epoch": 0.6389201349831272, + "grad_norm": 0.04674089327454567, + "learning_rate": 2e-05, + "loss": 0.2755, + "step": 497 + }, + { + "epoch": 0.6402056885746424, + "grad_norm": 0.056766536086797714, + "learning_rate": 2e-05, + "loss": 0.3656, + "step": 498 + }, + { + "epoch": 0.6414912421661578, + "grad_norm": 0.04216759279370308, + "learning_rate": 2e-05, + "loss": 0.2931, + "step": 499 + }, + { + "epoch": 0.6427767957576731, + "grad_norm": 0.04742797464132309, + "learning_rate": 2e-05, + "loss": 0.3386, + "step": 500 + }, + { + "epoch": 0.6440623493491885, + "grad_norm": 0.05907592922449112, + "learning_rate": 2e-05, + "loss": 0.3194, + "step": 501 + }, + { + "epoch": 0.6453479029407039, + "grad_norm": 0.047280214726924896, + "learning_rate": 2e-05, + "loss": 0.2993, + "step": 502 + }, + { + "epoch": 0.6466334565322192, + "grad_norm": 0.03869684040546417, + "learning_rate": 2e-05, + "loss": 0.158, + "step": 503 + }, + { + "epoch": 0.6479190101237345, + "grad_norm": 0.04897621273994446, + "learning_rate": 2e-05, + "loss": 0.1948, + "step": 504 + }, + { + "epoch": 0.6492045637152499, + "grad_norm": 0.055846258997917175, + "learning_rate": 2e-05, + "loss": 0.312, + "step": 505 + }, + { + "epoch": 0.6504901173067652, + "grad_norm": 0.04266876354813576, + "learning_rate": 2e-05, + "loss": 0.2377, + "step": 506 + }, + { + "epoch": 0.6517756708982806, + "grad_norm": 0.050029207020998, + "learning_rate": 2e-05, + "loss": 0.1797, + "step": 507 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.035082824528217316, + "learning_rate": 2e-05, + "loss": 0.1799, + "step": 508 + }, + { + "epoch": 0.6543467780813113, + "grad_norm": 0.04430130124092102, + "learning_rate": 2e-05, + "loss": 0.2327, + "step": 509 + }, + { + "epoch": 0.6556323316728266, + "grad_norm": 0.03854670003056526, + "learning_rate": 2e-05, + "loss": 0.2235, + "step": 510 + }, + { + "epoch": 0.6569178852643419, + "grad_norm": 0.04970936104655266, + "learning_rate": 2e-05, + "loss": 0.2818, + "step": 511 + }, + { + "epoch": 0.6582034388558573, + "grad_norm": 0.04700899496674538, + "learning_rate": 2e-05, + "loss": 0.2417, + "step": 512 + }, + { + "epoch": 0.6594889924473727, + "grad_norm": 0.04256317391991615, + "learning_rate": 2e-05, + "loss": 0.2703, + "step": 513 + }, + { + "epoch": 0.660774546038888, + "grad_norm": 0.04744260385632515, + "learning_rate": 2e-05, + "loss": 0.2048, + "step": 514 + }, + { + "epoch": 0.6620600996304034, + "grad_norm": 0.04310823976993561, + "learning_rate": 2e-05, + "loss": 0.1897, + "step": 515 + }, + { + "epoch": 0.6633456532219187, + "grad_norm": 0.04300684109330177, + "learning_rate": 2e-05, + "loss": 0.2139, + "step": 516 + }, + { + "epoch": 0.664631206813434, + "grad_norm": 0.05581510066986084, + "learning_rate": 2e-05, + "loss": 0.2616, + "step": 517 + }, + { + "epoch": 0.6659167604049494, + "grad_norm": 0.055505942553281784, + "learning_rate": 2e-05, + "loss": 0.2915, + "step": 518 + }, + { + "epoch": 0.6672023139964647, + "grad_norm": 0.040814101696014404, + "learning_rate": 2e-05, + "loss": 0.2187, + "step": 519 + }, + { + "epoch": 0.6684878675879801, + "grad_norm": 0.05864616110920906, + "learning_rate": 2e-05, + "loss": 0.3499, + "step": 520 + }, + { + "epoch": 0.6697734211794955, + "grad_norm": 0.057373858988285065, + "learning_rate": 2e-05, + "loss": 0.3538, + "step": 521 + }, + { + "epoch": 0.6710589747710107, + "grad_norm": 0.041141483932733536, + "learning_rate": 2e-05, + "loss": 0.2711, + "step": 522 + }, + { + "epoch": 0.6723445283625261, + "grad_norm": 0.03994324058294296, + "learning_rate": 2e-05, + "loss": 0.182, + "step": 523 + }, + { + "epoch": 0.6736300819540415, + "grad_norm": 0.04982011020183563, + "learning_rate": 2e-05, + "loss": 0.2911, + "step": 524 + }, + { + "epoch": 0.6749156355455568, + "grad_norm": 0.04852016270160675, + "learning_rate": 2e-05, + "loss": 0.254, + "step": 525 + }, + { + "epoch": 0.6762011891370722, + "grad_norm": 0.05752996355295181, + "learning_rate": 2e-05, + "loss": 0.2969, + "step": 526 + }, + { + "epoch": 0.6774867427285874, + "grad_norm": 0.04058138653635979, + "learning_rate": 2e-05, + "loss": 0.1861, + "step": 527 + }, + { + "epoch": 0.6787722963201028, + "grad_norm": 0.05575535446405411, + "learning_rate": 2e-05, + "loss": 0.3174, + "step": 528 + }, + { + "epoch": 0.6800578499116182, + "grad_norm": 0.0468176007270813, + "learning_rate": 2e-05, + "loss": 0.2699, + "step": 529 + }, + { + "epoch": 0.6813434035031335, + "grad_norm": 0.054678115993738174, + "learning_rate": 2e-05, + "loss": 0.3051, + "step": 530 + }, + { + "epoch": 0.6826289570946489, + "grad_norm": 0.055189572274684906, + "learning_rate": 2e-05, + "loss": 0.2397, + "step": 531 + }, + { + "epoch": 0.6839145106861643, + "grad_norm": 0.048087868839502335, + "learning_rate": 2e-05, + "loss": 0.2302, + "step": 532 + }, + { + "epoch": 0.6852000642776795, + "grad_norm": 0.057727813720703125, + "learning_rate": 2e-05, + "loss": 0.2457, + "step": 533 + }, + { + "epoch": 0.6864856178691949, + "grad_norm": 0.04846923425793648, + "learning_rate": 2e-05, + "loss": 0.2506, + "step": 534 + }, + { + "epoch": 0.6877711714607103, + "grad_norm": 0.0410042330622673, + "learning_rate": 2e-05, + "loss": 0.198, + "step": 535 + }, + { + "epoch": 0.6890567250522256, + "grad_norm": 0.05333555117249489, + "learning_rate": 2e-05, + "loss": 0.283, + "step": 536 + }, + { + "epoch": 0.690342278643741, + "grad_norm": 0.05376364290714264, + "learning_rate": 2e-05, + "loss": 0.3337, + "step": 537 + }, + { + "epoch": 0.6916278322352563, + "grad_norm": 0.04879291355609894, + "learning_rate": 2e-05, + "loss": 0.3075, + "step": 538 + }, + { + "epoch": 0.6929133858267716, + "grad_norm": 0.0375969335436821, + "learning_rate": 2e-05, + "loss": 0.1652, + "step": 539 + }, + { + "epoch": 0.694198939418287, + "grad_norm": 0.042424045503139496, + "learning_rate": 2e-05, + "loss": 0.2398, + "step": 540 + }, + { + "epoch": 0.6954844930098023, + "grad_norm": 0.048496536910533905, + "learning_rate": 2e-05, + "loss": 0.239, + "step": 541 + }, + { + "epoch": 0.6967700466013177, + "grad_norm": 0.04180686175823212, + "learning_rate": 2e-05, + "loss": 0.2521, + "step": 542 + }, + { + "epoch": 0.6980556001928331, + "grad_norm": 0.046767883002758026, + "learning_rate": 2e-05, + "loss": 0.2113, + "step": 543 + }, + { + "epoch": 0.6993411537843484, + "grad_norm": 0.05949412286281586, + "learning_rate": 2e-05, + "loss": 0.3443, + "step": 544 + }, + { + "epoch": 0.7006267073758637, + "grad_norm": 0.04437008500099182, + "learning_rate": 2e-05, + "loss": 0.2244, + "step": 545 + }, + { + "epoch": 0.7019122609673791, + "grad_norm": 0.04240270331501961, + "learning_rate": 2e-05, + "loss": 0.2239, + "step": 546 + }, + { + "epoch": 0.7031978145588944, + "grad_norm": 0.04866647720336914, + "learning_rate": 2e-05, + "loss": 0.2846, + "step": 547 + }, + { + "epoch": 0.7044833681504098, + "grad_norm": 0.04255237057805061, + "learning_rate": 2e-05, + "loss": 0.1759, + "step": 548 + }, + { + "epoch": 0.7057689217419251, + "grad_norm": 0.04113907366991043, + "learning_rate": 2e-05, + "loss": 0.2481, + "step": 549 + }, + { + "epoch": 0.7070544753334405, + "grad_norm": 0.04230246692895889, + "learning_rate": 2e-05, + "loss": 0.1963, + "step": 550 + }, + { + "epoch": 0.7083400289249558, + "grad_norm": 0.05263131856918335, + "learning_rate": 2e-05, + "loss": 0.2355, + "step": 551 + }, + { + "epoch": 0.7096255825164711, + "grad_norm": 0.041025299578905106, + "learning_rate": 2e-05, + "loss": 0.193, + "step": 552 + }, + { + "epoch": 0.7109111361079865, + "grad_norm": 0.048196010291576385, + "learning_rate": 2e-05, + "loss": 0.2183, + "step": 553 + }, + { + "epoch": 0.7121966896995019, + "grad_norm": 0.05287821963429451, + "learning_rate": 2e-05, + "loss": 0.2969, + "step": 554 + }, + { + "epoch": 0.7134822432910172, + "grad_norm": 0.04392276331782341, + "learning_rate": 2e-05, + "loss": 0.2029, + "step": 555 + }, + { + "epoch": 0.7147677968825326, + "grad_norm": 0.05237026512622833, + "learning_rate": 2e-05, + "loss": 0.2653, + "step": 556 + }, + { + "epoch": 0.7160533504740478, + "grad_norm": 0.05913091078400612, + "learning_rate": 2e-05, + "loss": 0.2944, + "step": 557 + }, + { + "epoch": 0.7173389040655632, + "grad_norm": 0.04113471135497093, + "learning_rate": 2e-05, + "loss": 0.2411, + "step": 558 + }, + { + "epoch": 0.7186244576570786, + "grad_norm": 0.040105462074279785, + "learning_rate": 2e-05, + "loss": 0.1857, + "step": 559 + }, + { + "epoch": 0.7199100112485939, + "grad_norm": 0.058607831597328186, + "learning_rate": 2e-05, + "loss": 0.1984, + "step": 560 + }, + { + "epoch": 0.7211955648401093, + "grad_norm": 0.043256357312202454, + "learning_rate": 2e-05, + "loss": 0.2584, + "step": 561 + }, + { + "epoch": 0.7224811184316247, + "grad_norm": 0.05908385291695595, + "learning_rate": 2e-05, + "loss": 0.33, + "step": 562 + }, + { + "epoch": 0.7237666720231399, + "grad_norm": 0.050697483122348785, + "learning_rate": 2e-05, + "loss": 0.242, + "step": 563 + }, + { + "epoch": 0.7250522256146553, + "grad_norm": 0.05611984431743622, + "learning_rate": 2e-05, + "loss": 0.3334, + "step": 564 + }, + { + "epoch": 0.7263377792061707, + "grad_norm": 0.05749541521072388, + "learning_rate": 2e-05, + "loss": 0.2454, + "step": 565 + }, + { + "epoch": 0.727623332797686, + "grad_norm": 0.05453288555145264, + "learning_rate": 2e-05, + "loss": 0.249, + "step": 566 + }, + { + "epoch": 0.7289088863892014, + "grad_norm": 0.061655569821596146, + "learning_rate": 2e-05, + "loss": 0.2954, + "step": 567 + }, + { + "epoch": 0.7301944399807166, + "grad_norm": 0.051404744386672974, + "learning_rate": 2e-05, + "loss": 0.2356, + "step": 568 + }, + { + "epoch": 0.731479993572232, + "grad_norm": 0.04265725240111351, + "learning_rate": 2e-05, + "loss": 0.1842, + "step": 569 + }, + { + "epoch": 0.7327655471637474, + "grad_norm": 0.06363217532634735, + "learning_rate": 2e-05, + "loss": 0.3187, + "step": 570 + }, + { + "epoch": 0.7340511007552627, + "grad_norm": 0.04742373526096344, + "learning_rate": 2e-05, + "loss": 0.2286, + "step": 571 + }, + { + "epoch": 0.7353366543467781, + "grad_norm": 0.05723915994167328, + "learning_rate": 2e-05, + "loss": 0.3183, + "step": 572 + }, + { + "epoch": 0.7366222079382935, + "grad_norm": 0.04636276140809059, + "learning_rate": 2e-05, + "loss": 0.2172, + "step": 573 + }, + { + "epoch": 0.7379077615298087, + "grad_norm": 0.041882552206516266, + "learning_rate": 2e-05, + "loss": 0.195, + "step": 574 + }, + { + "epoch": 0.7391933151213241, + "grad_norm": 0.05022399127483368, + "learning_rate": 2e-05, + "loss": 0.2564, + "step": 575 + }, + { + "epoch": 0.7404788687128395, + "grad_norm": 0.058215439319610596, + "learning_rate": 2e-05, + "loss": 0.3047, + "step": 576 + }, + { + "epoch": 0.7417644223043548, + "grad_norm": 0.04993325099349022, + "learning_rate": 2e-05, + "loss": 0.1955, + "step": 577 + }, + { + "epoch": 0.7430499758958702, + "grad_norm": 0.05288231745362282, + "learning_rate": 2e-05, + "loss": 0.3005, + "step": 578 + }, + { + "epoch": 0.7443355294873855, + "grad_norm": 0.055686481297016144, + "learning_rate": 2e-05, + "loss": 0.3304, + "step": 579 + }, + { + "epoch": 0.7456210830789008, + "grad_norm": 0.06084279343485832, + "learning_rate": 2e-05, + "loss": 0.3377, + "step": 580 + }, + { + "epoch": 0.7469066366704162, + "grad_norm": 0.041104961186647415, + "learning_rate": 2e-05, + "loss": 0.2019, + "step": 581 + }, + { + "epoch": 0.7481921902619315, + "grad_norm": 0.04409842938184738, + "learning_rate": 2e-05, + "loss": 0.2383, + "step": 582 + }, + { + "epoch": 0.7494777438534469, + "grad_norm": 0.050962381064891815, + "learning_rate": 2e-05, + "loss": 0.2439, + "step": 583 + }, + { + "epoch": 0.7507632974449623, + "grad_norm": 0.05231870710849762, + "learning_rate": 2e-05, + "loss": 0.2337, + "step": 584 + }, + { + "epoch": 0.7520488510364776, + "grad_norm": 0.04085131362080574, + "learning_rate": 2e-05, + "loss": 0.1451, + "step": 585 + }, + { + "epoch": 0.753334404627993, + "grad_norm": 0.04120944067835808, + "learning_rate": 2e-05, + "loss": 0.2029, + "step": 586 + }, + { + "epoch": 0.7546199582195082, + "grad_norm": 0.0363801047205925, + "learning_rate": 2e-05, + "loss": 0.1393, + "step": 587 + }, + { + "epoch": 0.7559055118110236, + "grad_norm": 0.04919865354895592, + "learning_rate": 2e-05, + "loss": 0.2308, + "step": 588 + }, + { + "epoch": 0.757191065402539, + "grad_norm": 0.0516657792031765, + "learning_rate": 2e-05, + "loss": 0.3006, + "step": 589 + }, + { + "epoch": 0.7584766189940543, + "grad_norm": 0.07350458204746246, + "learning_rate": 2e-05, + "loss": 0.3796, + "step": 590 + }, + { + "epoch": 0.7597621725855697, + "grad_norm": 0.05353572219610214, + "learning_rate": 2e-05, + "loss": 0.2548, + "step": 591 + }, + { + "epoch": 0.761047726177085, + "grad_norm": 0.04492725431919098, + "learning_rate": 2e-05, + "loss": 0.199, + "step": 592 + }, + { + "epoch": 0.7623332797686003, + "grad_norm": 0.04892539232969284, + "learning_rate": 2e-05, + "loss": 0.2108, + "step": 593 + }, + { + "epoch": 0.7636188333601157, + "grad_norm": 0.03860924020409584, + "learning_rate": 2e-05, + "loss": 0.1896, + "step": 594 + }, + { + "epoch": 0.7649043869516311, + "grad_norm": 0.052807312458753586, + "learning_rate": 2e-05, + "loss": 0.2709, + "step": 595 + }, + { + "epoch": 0.7661899405431464, + "grad_norm": 0.04871145263314247, + "learning_rate": 2e-05, + "loss": 0.2779, + "step": 596 + }, + { + "epoch": 0.7674754941346618, + "grad_norm": 0.04021324962377548, + "learning_rate": 2e-05, + "loss": 0.2136, + "step": 597 + }, + { + "epoch": 0.768761047726177, + "grad_norm": 0.050265613943338394, + "learning_rate": 2e-05, + "loss": 0.2601, + "step": 598 + }, + { + "epoch": 0.7700466013176924, + "grad_norm": 0.03576774150133133, + "learning_rate": 2e-05, + "loss": 0.2114, + "step": 599 + }, + { + "epoch": 0.7713321549092078, + "grad_norm": 0.055398743599653244, + "learning_rate": 2e-05, + "loss": 0.2701, + "step": 600 + }, + { + "epoch": 0.7726177085007231, + "grad_norm": 0.06506812572479248, + "learning_rate": 2e-05, + "loss": 0.3518, + "step": 601 + }, + { + "epoch": 0.7739032620922385, + "grad_norm": 0.037148088216781616, + "learning_rate": 2e-05, + "loss": 0.1438, + "step": 602 + }, + { + "epoch": 0.7751888156837539, + "grad_norm": 0.046173613518476486, + "learning_rate": 2e-05, + "loss": 0.2294, + "step": 603 + }, + { + "epoch": 0.7764743692752691, + "grad_norm": 0.06617863476276398, + "learning_rate": 2e-05, + "loss": 0.2888, + "step": 604 + }, + { + "epoch": 0.7777599228667845, + "grad_norm": 0.051207173615694046, + "learning_rate": 2e-05, + "loss": 0.2624, + "step": 605 + }, + { + "epoch": 0.7790454764582998, + "grad_norm": 0.041766516864299774, + "learning_rate": 2e-05, + "loss": 0.1881, + "step": 606 + }, + { + "epoch": 0.7803310300498152, + "grad_norm": 0.05160610005259514, + "learning_rate": 2e-05, + "loss": 0.258, + "step": 607 + }, + { + "epoch": 0.7816165836413306, + "grad_norm": 0.04584109038114548, + "learning_rate": 2e-05, + "loss": 0.2087, + "step": 608 + }, + { + "epoch": 0.7829021372328459, + "grad_norm": 0.04200456291437149, + "learning_rate": 2e-05, + "loss": 0.2036, + "step": 609 + }, + { + "epoch": 0.7841876908243612, + "grad_norm": 0.039162181317806244, + "learning_rate": 2e-05, + "loss": 0.1833, + "step": 610 + }, + { + "epoch": 0.7854732444158766, + "grad_norm": 0.041861940175294876, + "learning_rate": 2e-05, + "loss": 0.1623, + "step": 611 + }, + { + "epoch": 0.7867587980073919, + "grad_norm": 0.05622352659702301, + "learning_rate": 2e-05, + "loss": 0.3556, + "step": 612 + }, + { + "epoch": 0.7880443515989073, + "grad_norm": 0.048621952533721924, + "learning_rate": 2e-05, + "loss": 0.2211, + "step": 613 + }, + { + "epoch": 0.7893299051904227, + "grad_norm": 0.0437590628862381, + "learning_rate": 2e-05, + "loss": 0.2015, + "step": 614 + }, + { + "epoch": 0.790615458781938, + "grad_norm": 0.05675414949655533, + "learning_rate": 2e-05, + "loss": 0.2416, + "step": 615 + }, + { + "epoch": 0.7919010123734533, + "grad_norm": 0.03869640827178955, + "learning_rate": 2e-05, + "loss": 0.1655, + "step": 616 + }, + { + "epoch": 0.7931865659649686, + "grad_norm": 0.04821722209453583, + "learning_rate": 2e-05, + "loss": 0.1902, + "step": 617 + }, + { + "epoch": 0.794472119556484, + "grad_norm": 0.04423803463578224, + "learning_rate": 2e-05, + "loss": 0.157, + "step": 618 + }, + { + "epoch": 0.7957576731479994, + "grad_norm": 0.04364867880940437, + "learning_rate": 2e-05, + "loss": 0.2406, + "step": 619 + }, + { + "epoch": 0.7970432267395147, + "grad_norm": 0.059711892157793045, + "learning_rate": 2e-05, + "loss": 0.2981, + "step": 620 + }, + { + "epoch": 0.79832878033103, + "grad_norm": 0.046063173562288284, + "learning_rate": 2e-05, + "loss": 0.2184, + "step": 621 + }, + { + "epoch": 0.7996143339225454, + "grad_norm": 0.06073896959424019, + "learning_rate": 2e-05, + "loss": 0.2351, + "step": 622 + }, + { + "epoch": 0.8008998875140607, + "grad_norm": 0.039248064160346985, + "learning_rate": 2e-05, + "loss": 0.1888, + "step": 623 + }, + { + "epoch": 0.8021854411055761, + "grad_norm": 0.05402129143476486, + "learning_rate": 2e-05, + "loss": 0.3368, + "step": 624 + }, + { + "epoch": 0.8034709946970915, + "grad_norm": 0.04230786859989166, + "learning_rate": 2e-05, + "loss": 0.1748, + "step": 625 + }, + { + "epoch": 0.8047565482886068, + "grad_norm": 0.06045274809002876, + "learning_rate": 2e-05, + "loss": 0.3958, + "step": 626 + }, + { + "epoch": 0.8060421018801222, + "grad_norm": 0.04717743769288063, + "learning_rate": 2e-05, + "loss": 0.2704, + "step": 627 + }, + { + "epoch": 0.8073276554716374, + "grad_norm": 0.04878292232751846, + "learning_rate": 2e-05, + "loss": 0.2412, + "step": 628 + }, + { + "epoch": 0.8086132090631528, + "grad_norm": 0.038947124034166336, + "learning_rate": 2e-05, + "loss": 0.2169, + "step": 629 + }, + { + "epoch": 0.8098987626546682, + "grad_norm": 0.0614759586751461, + "learning_rate": 2e-05, + "loss": 0.3013, + "step": 630 + }, + { + "epoch": 0.8111843162461835, + "grad_norm": 0.06246621906757355, + "learning_rate": 2e-05, + "loss": 0.2947, + "step": 631 + }, + { + "epoch": 0.8124698698376989, + "grad_norm": 0.06976212561130524, + "learning_rate": 2e-05, + "loss": 0.297, + "step": 632 + }, + { + "epoch": 0.8137554234292143, + "grad_norm": 0.03317941352725029, + "learning_rate": 2e-05, + "loss": 0.1375, + "step": 633 + }, + { + "epoch": 0.8150409770207295, + "grad_norm": 0.06765579432249069, + "learning_rate": 2e-05, + "loss": 0.258, + "step": 634 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.06797792762517929, + "learning_rate": 2e-05, + "loss": 0.256, + "step": 635 + }, + { + "epoch": 0.8176120842037602, + "grad_norm": 0.059785496443510056, + "learning_rate": 2e-05, + "loss": 0.3343, + "step": 636 + }, + { + "epoch": 0.8188976377952756, + "grad_norm": 0.059780728071928024, + "learning_rate": 2e-05, + "loss": 0.3634, + "step": 637 + }, + { + "epoch": 0.820183191386791, + "grad_norm": 0.04111599549651146, + "learning_rate": 2e-05, + "loss": 0.2011, + "step": 638 + }, + { + "epoch": 0.8214687449783062, + "grad_norm": 0.04656028002500534, + "learning_rate": 2e-05, + "loss": 0.2214, + "step": 639 + }, + { + "epoch": 0.8227542985698216, + "grad_norm": 0.054362326860427856, + "learning_rate": 2e-05, + "loss": 0.2928, + "step": 640 + }, + { + "epoch": 0.824039852161337, + "grad_norm": 0.04594152048230171, + "learning_rate": 2e-05, + "loss": 0.2285, + "step": 641 + }, + { + "epoch": 0.8253254057528523, + "grad_norm": 0.056715745478868484, + "learning_rate": 2e-05, + "loss": 0.2531, + "step": 642 + }, + { + "epoch": 0.8266109593443677, + "grad_norm": 0.049057237803936005, + "learning_rate": 2e-05, + "loss": 0.1749, + "step": 643 + }, + { + "epoch": 0.8278965129358831, + "grad_norm": 0.05435045436024666, + "learning_rate": 2e-05, + "loss": 0.2796, + "step": 644 + }, + { + "epoch": 0.8291820665273983, + "grad_norm": 0.049284275621175766, + "learning_rate": 2e-05, + "loss": 0.2381, + "step": 645 + }, + { + "epoch": 0.8304676201189137, + "grad_norm": 0.044050633907318115, + "learning_rate": 2e-05, + "loss": 0.2804, + "step": 646 + }, + { + "epoch": 0.831753173710429, + "grad_norm": 0.054185982793569565, + "learning_rate": 2e-05, + "loss": 0.2617, + "step": 647 + }, + { + "epoch": 0.8330387273019444, + "grad_norm": 0.0534062534570694, + "learning_rate": 2e-05, + "loss": 0.2502, + "step": 648 + }, + { + "epoch": 0.8343242808934598, + "grad_norm": 0.06242300197482109, + "learning_rate": 2e-05, + "loss": 0.2662, + "step": 649 + }, + { + "epoch": 0.8356098344849751, + "grad_norm": 0.0385594442486763, + "learning_rate": 2e-05, + "loss": 0.1897, + "step": 650 + }, + { + "epoch": 0.8368953880764904, + "grad_norm": 0.065641388297081, + "learning_rate": 2e-05, + "loss": 0.3179, + "step": 651 + }, + { + "epoch": 0.8381809416680058, + "grad_norm": 0.054985061287879944, + "learning_rate": 2e-05, + "loss": 0.222, + "step": 652 + }, + { + "epoch": 0.8394664952595211, + "grad_norm": 0.05766449496150017, + "learning_rate": 2e-05, + "loss": 0.289, + "step": 653 + }, + { + "epoch": 0.8407520488510365, + "grad_norm": 0.04635515809059143, + "learning_rate": 2e-05, + "loss": 0.2464, + "step": 654 + }, + { + "epoch": 0.8420376024425518, + "grad_norm": 0.0583229660987854, + "learning_rate": 2e-05, + "loss": 0.2436, + "step": 655 + }, + { + "epoch": 0.8433231560340672, + "grad_norm": 0.04983345419168472, + "learning_rate": 2e-05, + "loss": 0.2534, + "step": 656 + }, + { + "epoch": 0.8446087096255825, + "grad_norm": 0.04292474314570427, + "learning_rate": 2e-05, + "loss": 0.1772, + "step": 657 + }, + { + "epoch": 0.8458942632170978, + "grad_norm": 0.05735989660024643, + "learning_rate": 2e-05, + "loss": 0.267, + "step": 658 + }, + { + "epoch": 0.8471798168086132, + "grad_norm": 0.055415477603673935, + "learning_rate": 2e-05, + "loss": 0.2651, + "step": 659 + }, + { + "epoch": 0.8484653704001286, + "grad_norm": 0.052020199596881866, + "learning_rate": 2e-05, + "loss": 0.2177, + "step": 660 + }, + { + "epoch": 0.8497509239916439, + "grad_norm": 0.05934329703450203, + "learning_rate": 2e-05, + "loss": 0.2665, + "step": 661 + }, + { + "epoch": 0.8510364775831593, + "grad_norm": 0.06611707806587219, + "learning_rate": 2e-05, + "loss": 0.3774, + "step": 662 + }, + { + "epoch": 0.8523220311746746, + "grad_norm": 0.05337178707122803, + "learning_rate": 2e-05, + "loss": 0.2699, + "step": 663 + }, + { + "epoch": 0.8536075847661899, + "grad_norm": 0.05552757531404495, + "learning_rate": 2e-05, + "loss": 0.2204, + "step": 664 + }, + { + "epoch": 0.8548931383577053, + "grad_norm": 0.051326069980859756, + "learning_rate": 2e-05, + "loss": 0.1791, + "step": 665 + }, + { + "epoch": 0.8561786919492206, + "grad_norm": 0.04780028760433197, + "learning_rate": 2e-05, + "loss": 0.1959, + "step": 666 + }, + { + "epoch": 0.857464245540736, + "grad_norm": 0.06344909965991974, + "learning_rate": 2e-05, + "loss": 0.2809, + "step": 667 + }, + { + "epoch": 0.8587497991322514, + "grad_norm": 0.0526767373085022, + "learning_rate": 2e-05, + "loss": 0.2547, + "step": 668 + }, + { + "epoch": 0.8600353527237666, + "grad_norm": 0.04369194433093071, + "learning_rate": 2e-05, + "loss": 0.233, + "step": 669 + }, + { + "epoch": 0.861320906315282, + "grad_norm": 0.05023709312081337, + "learning_rate": 2e-05, + "loss": 0.2576, + "step": 670 + }, + { + "epoch": 0.8626064599067974, + "grad_norm": 0.06402754783630371, + "learning_rate": 2e-05, + "loss": 0.2579, + "step": 671 + }, + { + "epoch": 0.8638920134983127, + "grad_norm": 0.06747744977474213, + "learning_rate": 2e-05, + "loss": 0.393, + "step": 672 + }, + { + "epoch": 0.8651775670898281, + "grad_norm": 0.06799997389316559, + "learning_rate": 2e-05, + "loss": 0.3114, + "step": 673 + }, + { + "epoch": 0.8664631206813435, + "grad_norm": 0.044738415628671646, + "learning_rate": 2e-05, + "loss": 0.222, + "step": 674 + }, + { + "epoch": 0.8677486742728587, + "grad_norm": 0.05913526564836502, + "learning_rate": 2e-05, + "loss": 0.2701, + "step": 675 + }, + { + "epoch": 0.8690342278643741, + "grad_norm": 0.052639495581388474, + "learning_rate": 2e-05, + "loss": 0.2279, + "step": 676 + }, + { + "epoch": 0.8703197814558894, + "grad_norm": 0.0436641164124012, + "learning_rate": 2e-05, + "loss": 0.1722, + "step": 677 + }, + { + "epoch": 0.8716053350474048, + "grad_norm": 0.06275106966495514, + "learning_rate": 2e-05, + "loss": 0.3289, + "step": 678 + }, + { + "epoch": 0.8728908886389202, + "grad_norm": 0.034002162516117096, + "learning_rate": 2e-05, + "loss": 0.1262, + "step": 679 + }, + { + "epoch": 0.8741764422304354, + "grad_norm": 0.04524555802345276, + "learning_rate": 2e-05, + "loss": 0.1765, + "step": 680 + }, + { + "epoch": 0.8754619958219508, + "grad_norm": 0.04776989668607712, + "learning_rate": 2e-05, + "loss": 0.242, + "step": 681 + }, + { + "epoch": 0.8767475494134662, + "grad_norm": 0.060143712908029556, + "learning_rate": 2e-05, + "loss": 0.24, + "step": 682 + }, + { + "epoch": 0.8780331030049815, + "grad_norm": 0.06363454461097717, + "learning_rate": 2e-05, + "loss": 0.3104, + "step": 683 + }, + { + "epoch": 0.8793186565964969, + "grad_norm": 0.05736486613750458, + "learning_rate": 2e-05, + "loss": 0.3299, + "step": 684 + }, + { + "epoch": 0.8806042101880122, + "grad_norm": 0.048391181975603104, + "learning_rate": 2e-05, + "loss": 0.1937, + "step": 685 + }, + { + "epoch": 0.8818897637795275, + "grad_norm": 0.047165125608444214, + "learning_rate": 2e-05, + "loss": 0.2608, + "step": 686 + }, + { + "epoch": 0.8831753173710429, + "grad_norm": 0.061681345105171204, + "learning_rate": 2e-05, + "loss": 0.2948, + "step": 687 + }, + { + "epoch": 0.8844608709625582, + "grad_norm": 0.060136910527944565, + "learning_rate": 2e-05, + "loss": 0.2272, + "step": 688 + }, + { + "epoch": 0.8857464245540736, + "grad_norm": 0.047498807311058044, + "learning_rate": 2e-05, + "loss": 0.1813, + "step": 689 + }, + { + "epoch": 0.887031978145589, + "grad_norm": 0.06447866559028625, + "learning_rate": 2e-05, + "loss": 0.2808, + "step": 690 + }, + { + "epoch": 0.8883175317371043, + "grad_norm": 0.05992686748504639, + "learning_rate": 2e-05, + "loss": 0.262, + "step": 691 + }, + { + "epoch": 0.8896030853286196, + "grad_norm": 0.048196423798799515, + "learning_rate": 2e-05, + "loss": 0.2238, + "step": 692 + }, + { + "epoch": 0.890888638920135, + "grad_norm": 0.06860709935426712, + "learning_rate": 2e-05, + "loss": 0.2679, + "step": 693 + }, + { + "epoch": 0.8921741925116503, + "grad_norm": 0.05085690692067146, + "learning_rate": 2e-05, + "loss": 0.2948, + "step": 694 + }, + { + "epoch": 0.8934597461031657, + "grad_norm": 0.06869999319314957, + "learning_rate": 2e-05, + "loss": 0.2961, + "step": 695 + }, + { + "epoch": 0.894745299694681, + "grad_norm": 0.04691535234451294, + "learning_rate": 2e-05, + "loss": 0.2019, + "step": 696 + }, + { + "epoch": 0.8960308532861964, + "grad_norm": 0.04785510525107384, + "learning_rate": 2e-05, + "loss": 0.147, + "step": 697 + }, + { + "epoch": 0.8973164068777117, + "grad_norm": 0.06156083196401596, + "learning_rate": 2e-05, + "loss": 0.2215, + "step": 698 + }, + { + "epoch": 0.898601960469227, + "grad_norm": 0.051647745072841644, + "learning_rate": 2e-05, + "loss": 0.2252, + "step": 699 + }, + { + "epoch": 0.8998875140607424, + "grad_norm": 0.04751814156770706, + "learning_rate": 2e-05, + "loss": 0.2482, + "step": 700 + }, + { + "epoch": 0.9011730676522578, + "grad_norm": 0.05452054366469383, + "learning_rate": 2e-05, + "loss": 0.2138, + "step": 701 + }, + { + "epoch": 0.9024586212437731, + "grad_norm": 0.045277033001184464, + "learning_rate": 2e-05, + "loss": 0.2148, + "step": 702 + }, + { + "epoch": 0.9037441748352885, + "grad_norm": 0.045462466776371, + "learning_rate": 2e-05, + "loss": 0.1711, + "step": 703 + }, + { + "epoch": 0.9050297284268038, + "grad_norm": 0.06722573935985565, + "learning_rate": 2e-05, + "loss": 0.3205, + "step": 704 + }, + { + "epoch": 0.9063152820183191, + "grad_norm": 0.05163208395242691, + "learning_rate": 2e-05, + "loss": 0.247, + "step": 705 + }, + { + "epoch": 0.9076008356098345, + "grad_norm": 0.052614837884902954, + "learning_rate": 2e-05, + "loss": 0.2002, + "step": 706 + }, + { + "epoch": 0.9088863892013498, + "grad_norm": 0.03826769068837166, + "learning_rate": 2e-05, + "loss": 0.1744, + "step": 707 + }, + { + "epoch": 0.9101719427928652, + "grad_norm": 0.04780410975217819, + "learning_rate": 2e-05, + "loss": 0.2524, + "step": 708 + }, + { + "epoch": 0.9114574963843806, + "grad_norm": 0.03547963872551918, + "learning_rate": 2e-05, + "loss": 0.1674, + "step": 709 + }, + { + "epoch": 0.9127430499758958, + "grad_norm": 0.0573282465338707, + "learning_rate": 2e-05, + "loss": 0.2749, + "step": 710 + }, + { + "epoch": 0.9140286035674112, + "grad_norm": 0.0570538304746151, + "learning_rate": 2e-05, + "loss": 0.2412, + "step": 711 + }, + { + "epoch": 0.9153141571589266, + "grad_norm": 0.054683949798345566, + "learning_rate": 2e-05, + "loss": 0.2537, + "step": 712 + }, + { + "epoch": 0.9165997107504419, + "grad_norm": 0.05413772165775299, + "learning_rate": 2e-05, + "loss": 0.2314, + "step": 713 + }, + { + "epoch": 0.9178852643419573, + "grad_norm": 0.05124877020716667, + "learning_rate": 2e-05, + "loss": 0.2645, + "step": 714 + }, + { + "epoch": 0.9191708179334726, + "grad_norm": 0.06577921658754349, + "learning_rate": 2e-05, + "loss": 0.314, + "step": 715 + }, + { + "epoch": 0.9204563715249879, + "grad_norm": 0.05663186311721802, + "learning_rate": 2e-05, + "loss": 0.2422, + "step": 716 + }, + { + "epoch": 0.9217419251165033, + "grad_norm": 0.05851929262280464, + "learning_rate": 2e-05, + "loss": 0.2845, + "step": 717 + }, + { + "epoch": 0.9230274787080186, + "grad_norm": 0.06582541763782501, + "learning_rate": 2e-05, + "loss": 0.2487, + "step": 718 + }, + { + "epoch": 0.924313032299534, + "grad_norm": 0.0434844084084034, + "learning_rate": 2e-05, + "loss": 0.191, + "step": 719 + }, + { + "epoch": 0.9255985858910494, + "grad_norm": 0.056996386498212814, + "learning_rate": 2e-05, + "loss": 0.2733, + "step": 720 + }, + { + "epoch": 0.9268841394825647, + "grad_norm": 0.04399803280830383, + "learning_rate": 2e-05, + "loss": 0.1991, + "step": 721 + }, + { + "epoch": 0.92816969307408, + "grad_norm": 0.047656819224357605, + "learning_rate": 2e-05, + "loss": 0.2274, + "step": 722 + }, + { + "epoch": 0.9294552466655954, + "grad_norm": 0.0753135085105896, + "learning_rate": 2e-05, + "loss": 0.3748, + "step": 723 + }, + { + "epoch": 0.9307408002571107, + "grad_norm": 0.07544931024312973, + "learning_rate": 2e-05, + "loss": 0.282, + "step": 724 + }, + { + "epoch": 0.9320263538486261, + "grad_norm": 0.05577397346496582, + "learning_rate": 2e-05, + "loss": 0.273, + "step": 725 + }, + { + "epoch": 0.9333119074401414, + "grad_norm": 0.039960604161024094, + "learning_rate": 2e-05, + "loss": 0.1423, + "step": 726 + }, + { + "epoch": 0.9345974610316568, + "grad_norm": 0.0625922679901123, + "learning_rate": 2e-05, + "loss": 0.2504, + "step": 727 + }, + { + "epoch": 0.9358830146231721, + "grad_norm": 0.060125015676021576, + "learning_rate": 2e-05, + "loss": 0.2061, + "step": 728 + }, + { + "epoch": 0.9371685682146874, + "grad_norm": 0.06697895377874374, + "learning_rate": 2e-05, + "loss": 0.2672, + "step": 729 + }, + { + "epoch": 0.9384541218062028, + "grad_norm": 0.09079831093549728, + "learning_rate": 2e-05, + "loss": 0.3944, + "step": 730 + }, + { + "epoch": 0.9397396753977182, + "grad_norm": 0.05246804282069206, + "learning_rate": 2e-05, + "loss": 0.2153, + "step": 731 + }, + { + "epoch": 0.9410252289892335, + "grad_norm": 0.03938793018460274, + "learning_rate": 2e-05, + "loss": 0.1496, + "step": 732 + }, + { + "epoch": 0.9423107825807489, + "grad_norm": 0.05081872642040253, + "learning_rate": 2e-05, + "loss": 0.1939, + "step": 733 + }, + { + "epoch": 0.9435963361722641, + "grad_norm": 0.055075064301490784, + "learning_rate": 2e-05, + "loss": 0.2314, + "step": 734 + }, + { + "epoch": 0.9448818897637795, + "grad_norm": 0.057048946619033813, + "learning_rate": 2e-05, + "loss": 0.2258, + "step": 735 + }, + { + "epoch": 0.9461674433552949, + "grad_norm": 0.0564640611410141, + "learning_rate": 2e-05, + "loss": 0.221, + "step": 736 + }, + { + "epoch": 0.9474529969468102, + "grad_norm": 0.06246118247509003, + "learning_rate": 2e-05, + "loss": 0.2655, + "step": 737 + }, + { + "epoch": 0.9487385505383256, + "grad_norm": 0.06543996930122375, + "learning_rate": 2e-05, + "loss": 0.3487, + "step": 738 + }, + { + "epoch": 0.950024104129841, + "grad_norm": 0.05123418942093849, + "learning_rate": 2e-05, + "loss": 0.2593, + "step": 739 + }, + { + "epoch": 0.9513096577213562, + "grad_norm": 0.04761409014463425, + "learning_rate": 2e-05, + "loss": 0.1717, + "step": 740 + }, + { + "epoch": 0.9525952113128716, + "grad_norm": 0.05747079476714134, + "learning_rate": 2e-05, + "loss": 0.2239, + "step": 741 + }, + { + "epoch": 0.953880764904387, + "grad_norm": 0.04854227229952812, + "learning_rate": 2e-05, + "loss": 0.1742, + "step": 742 + }, + { + "epoch": 0.9551663184959023, + "grad_norm": 0.05784037709236145, + "learning_rate": 2e-05, + "loss": 0.203, + "step": 743 + }, + { + "epoch": 0.9564518720874177, + "grad_norm": 0.05370228737592697, + "learning_rate": 2e-05, + "loss": 0.255, + "step": 744 + }, + { + "epoch": 0.9577374256789329, + "grad_norm": 0.04535800218582153, + "learning_rate": 2e-05, + "loss": 0.1951, + "step": 745 + }, + { + "epoch": 0.9590229792704483, + "grad_norm": 0.044412512332201004, + "learning_rate": 2e-05, + "loss": 0.2087, + "step": 746 + }, + { + "epoch": 0.9603085328619637, + "grad_norm": 0.05077359825372696, + "learning_rate": 2e-05, + "loss": 0.19, + "step": 747 + }, + { + "epoch": 0.961594086453479, + "grad_norm": 0.056578539311885834, + "learning_rate": 2e-05, + "loss": 0.2784, + "step": 748 + }, + { + "epoch": 0.9628796400449944, + "grad_norm": 0.04252656549215317, + "learning_rate": 2e-05, + "loss": 0.239, + "step": 749 + }, + { + "epoch": 0.9641651936365098, + "grad_norm": 0.04754233360290527, + "learning_rate": 2e-05, + "loss": 0.1871, + "step": 750 + }, + { + "epoch": 0.965450747228025, + "grad_norm": 0.04948977380990982, + "learning_rate": 2e-05, + "loss": 0.2095, + "step": 751 + }, + { + "epoch": 0.9667363008195404, + "grad_norm": 0.056569986045360565, + "learning_rate": 2e-05, + "loss": 0.1627, + "step": 752 + }, + { + "epoch": 0.9680218544110558, + "grad_norm": 0.058012060821056366, + "learning_rate": 2e-05, + "loss": 0.277, + "step": 753 + }, + { + "epoch": 0.9693074080025711, + "grad_norm": 0.06445303559303284, + "learning_rate": 2e-05, + "loss": 0.3453, + "step": 754 + }, + { + "epoch": 0.9705929615940865, + "grad_norm": 0.04822942987084389, + "learning_rate": 2e-05, + "loss": 0.1958, + "step": 755 + }, + { + "epoch": 0.9718785151856018, + "grad_norm": 0.04951447993516922, + "learning_rate": 2e-05, + "loss": 0.2342, + "step": 756 + }, + { + "epoch": 0.9731640687771171, + "grad_norm": 0.04779404401779175, + "learning_rate": 2e-05, + "loss": 0.2277, + "step": 757 + }, + { + "epoch": 0.9744496223686325, + "grad_norm": 0.047998420894145966, + "learning_rate": 2e-05, + "loss": 0.1817, + "step": 758 + }, + { + "epoch": 0.9757351759601478, + "grad_norm": 0.050718434154987335, + "learning_rate": 2e-05, + "loss": 0.2289, + "step": 759 + }, + { + "epoch": 0.9770207295516632, + "grad_norm": 0.05427386984229088, + "learning_rate": 2e-05, + "loss": 0.2597, + "step": 760 + }, + { + "epoch": 0.9783062831431786, + "grad_norm": 0.06047537922859192, + "learning_rate": 2e-05, + "loss": 0.2597, + "step": 761 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.048412878066301346, + "learning_rate": 2e-05, + "loss": 0.2386, + "step": 762 + }, + { + "epoch": 0.9808773903262092, + "grad_norm": 0.04905233159661293, + "learning_rate": 2e-05, + "loss": 0.2239, + "step": 763 + }, + { + "epoch": 0.9821629439177245, + "grad_norm": 0.052379023283720016, + "learning_rate": 2e-05, + "loss": 0.263, + "step": 764 + }, + { + "epoch": 0.9834484975092399, + "grad_norm": 0.0489642396569252, + "learning_rate": 2e-05, + "loss": 0.225, + "step": 765 + }, + { + "epoch": 0.9847340511007553, + "grad_norm": 0.050984520465135574, + "learning_rate": 2e-05, + "loss": 0.219, + "step": 766 + }, + { + "epoch": 0.9860196046922706, + "grad_norm": 0.05487053468823433, + "learning_rate": 2e-05, + "loss": 0.1788, + "step": 767 + }, + { + "epoch": 0.987305158283786, + "grad_norm": 0.06488880515098572, + "learning_rate": 2e-05, + "loss": 0.2994, + "step": 768 + }, + { + "epoch": 0.9885907118753013, + "grad_norm": 0.057233408093452454, + "learning_rate": 2e-05, + "loss": 0.3028, + "step": 769 + }, + { + "epoch": 0.9898762654668166, + "grad_norm": 0.03885122016072273, + "learning_rate": 2e-05, + "loss": 0.1704, + "step": 770 + }, + { + "epoch": 0.991161819058332, + "grad_norm": 0.04395405203104019, + "learning_rate": 2e-05, + "loss": 0.168, + "step": 771 + }, + { + "epoch": 0.9924473726498474, + "grad_norm": 0.07156252861022949, + "learning_rate": 2e-05, + "loss": 0.3431, + "step": 772 + }, + { + "epoch": 0.9937329262413627, + "grad_norm": 0.05737178027629852, + "learning_rate": 2e-05, + "loss": 0.2595, + "step": 773 + }, + { + "epoch": 0.9950184798328781, + "grad_norm": 0.0596122108399868, + "learning_rate": 2e-05, + "loss": 0.2177, + "step": 774 + }, + { + "epoch": 0.9963040334243933, + "grad_norm": 0.0480956956744194, + "learning_rate": 2e-05, + "loss": 0.2008, + "step": 775 + }, + { + "epoch": 0.9975895870159087, + "grad_norm": 0.045857105404138565, + "learning_rate": 2e-05, + "loss": 0.2093, + "step": 776 + }, + { + "epoch": 0.9988751406074241, + "grad_norm": 0.05208531767129898, + "learning_rate": 2e-05, + "loss": 0.1512, + "step": 777 + }, + { + "epoch": 0.9988751406074241, + "step": 777, + "total_flos": 525522702336000.0, + "train_loss": 0.3197911096739186, + "train_runtime": 4432.7229, + "train_samples_per_second": 5.616, + "train_steps_per_second": 0.175 + } + ], + "logging_steps": 1.0, + "max_steps": 777, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 525522702336000.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}