{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200, "global_step": 1562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012804097311139564, "grad_norm": 0.5053460315076883, "learning_rate": 9.999989887057254e-06, "loss": 0.1728, "step": 1 }, { "epoch": 0.002560819462227913, "grad_norm": 0.5253416421561942, "learning_rate": 9.999959548269919e-06, "loss": 0.1815, "step": 2 }, { "epoch": 0.0038412291933418692, "grad_norm": 0.498571168917453, "learning_rate": 9.999908983760725e-06, "loss": 0.1939, "step": 3 }, { "epoch": 0.005121638924455826, "grad_norm": 0.4573411058358816, "learning_rate": 9.99983819373421e-06, "loss": 0.174, "step": 4 }, { "epoch": 0.006402048655569782, "grad_norm": 0.3781202135582942, "learning_rate": 9.999747178476736e-06, "loss": 0.1676, "step": 5 }, { "epoch": 0.0076824583866837385, "grad_norm": 0.3917683574358166, "learning_rate": 9.999635938356476e-06, "loss": 0.2018, "step": 6 }, { "epoch": 0.008962868117797696, "grad_norm": 0.35964786762972406, "learning_rate": 9.999504473823413e-06, "loss": 0.1687, "step": 7 }, { "epoch": 0.010243277848911651, "grad_norm": 0.3960869550476184, "learning_rate": 9.999352785409345e-06, "loss": 0.1759, "step": 8 }, { "epoch": 0.011523687580025609, "grad_norm": 0.3691209209170135, "learning_rate": 9.999180873727881e-06, "loss": 0.1457, "step": 9 }, { "epoch": 0.012804097311139564, "grad_norm": 0.3513368345700351, "learning_rate": 9.998988739474433e-06, "loss": 0.1659, "step": 10 }, { "epoch": 0.014084507042253521, "grad_norm": 0.3004889219309598, "learning_rate": 9.998776383426217e-06, "loss": 0.1507, "step": 11 }, { "epoch": 0.015364916773367477, "grad_norm": 0.31617635253255727, "learning_rate": 9.99854380644225e-06, "loss": 0.1588, "step": 12 }, { "epoch": 0.016645326504481434, "grad_norm": 0.2809924981506177, "learning_rate": 9.99829100946335e-06, "loss": 0.1465, "step": 13 }, { "epoch": 0.01792573623559539, "grad_norm": 0.3047251085576944, "learning_rate": 9.998017993512123e-06, "loss": 0.1559, "step": 14 }, { "epoch": 0.019206145966709345, "grad_norm": 0.31802856749859587, "learning_rate": 9.99772475969297e-06, "loss": 0.17, "step": 15 }, { "epoch": 0.020486555697823303, "grad_norm": 0.32215206459939755, "learning_rate": 9.99741130919207e-06, "loss": 0.1367, "step": 16 }, { "epoch": 0.02176696542893726, "grad_norm": 0.3160353255501763, "learning_rate": 9.997077643277388e-06, "loss": 0.1818, "step": 17 }, { "epoch": 0.023047375160051217, "grad_norm": 0.312380263506635, "learning_rate": 9.996723763298662e-06, "loss": 0.1425, "step": 18 }, { "epoch": 0.024327784891165175, "grad_norm": 0.2728155150098606, "learning_rate": 9.996349670687398e-06, "loss": 0.1443, "step": 19 }, { "epoch": 0.02560819462227913, "grad_norm": 0.3441023999218766, "learning_rate": 9.995955366956866e-06, "loss": 0.1238, "step": 20 }, { "epoch": 0.026888604353393086, "grad_norm": 0.24477199607892125, "learning_rate": 9.995540853702097e-06, "loss": 0.1348, "step": 21 }, { "epoch": 0.028169014084507043, "grad_norm": 0.21769738771616456, "learning_rate": 9.995106132599869e-06, "loss": 0.1148, "step": 22 }, { "epoch": 0.029449423815621, "grad_norm": 0.2520928288852759, "learning_rate": 9.994651205408705e-06, "loss": 0.1487, "step": 23 }, { "epoch": 0.030729833546734954, "grad_norm": 0.2392070804054465, "learning_rate": 9.99417607396887e-06, "loss": 0.1288, "step": 24 }, { "epoch": 0.03201024327784891, "grad_norm": 0.24545394712422589, "learning_rate": 9.993680740202349e-06, "loss": 0.1456, "step": 25 }, { "epoch": 0.03329065300896287, "grad_norm": 0.2284267156407845, "learning_rate": 9.993165206112857e-06, "loss": 0.1369, "step": 26 }, { "epoch": 0.034571062740076826, "grad_norm": 0.25244619443413246, "learning_rate": 9.992629473785825e-06, "loss": 0.1364, "step": 27 }, { "epoch": 0.03585147247119078, "grad_norm": 0.25805866098964997, "learning_rate": 9.992073545388379e-06, "loss": 0.1232, "step": 28 }, { "epoch": 0.03713188220230474, "grad_norm": 0.2921946489973403, "learning_rate": 9.991497423169352e-06, "loss": 0.1771, "step": 29 }, { "epoch": 0.03841229193341869, "grad_norm": 0.23266224832067517, "learning_rate": 9.990901109459258e-06, "loss": 0.1404, "step": 30 }, { "epoch": 0.03969270166453265, "grad_norm": 0.23372085575041607, "learning_rate": 9.990284606670294e-06, "loss": 0.125, "step": 31 }, { "epoch": 0.040973111395646605, "grad_norm": 0.23560174185240879, "learning_rate": 9.989647917296318e-06, "loss": 0.1451, "step": 32 }, { "epoch": 0.04225352112676056, "grad_norm": 0.214355752543102, "learning_rate": 9.988991043912857e-06, "loss": 0.1201, "step": 33 }, { "epoch": 0.04353393085787452, "grad_norm": 0.25564085790566204, "learning_rate": 9.988313989177076e-06, "loss": 0.1235, "step": 34 }, { "epoch": 0.04481434058898848, "grad_norm": 0.20338770940555145, "learning_rate": 9.987616755827785e-06, "loss": 0.1181, "step": 35 }, { "epoch": 0.046094750320102434, "grad_norm": 0.24223665248962686, "learning_rate": 9.986899346685413e-06, "loss": 0.1206, "step": 36 }, { "epoch": 0.04737516005121639, "grad_norm": 0.3280206187511514, "learning_rate": 9.98616176465201e-06, "loss": 0.1381, "step": 37 }, { "epoch": 0.04865556978233035, "grad_norm": 0.27977327916326145, "learning_rate": 9.985404012711222e-06, "loss": 0.1379, "step": 38 }, { "epoch": 0.0499359795134443, "grad_norm": 0.21920882965590394, "learning_rate": 9.984626093928295e-06, "loss": 0.1366, "step": 39 }, { "epoch": 0.05121638924455826, "grad_norm": 0.2447449401442065, "learning_rate": 9.983828011450045e-06, "loss": 0.144, "step": 40 }, { "epoch": 0.052496798975672214, "grad_norm": 0.22323032853945957, "learning_rate": 9.983009768504857e-06, "loss": 0.1275, "step": 41 }, { "epoch": 0.05377720870678617, "grad_norm": 0.23644064000162643, "learning_rate": 9.982171368402667e-06, "loss": 0.1438, "step": 42 }, { "epoch": 0.05505761843790013, "grad_norm": 0.24332093861017104, "learning_rate": 9.981312814534956e-06, "loss": 0.1138, "step": 43 }, { "epoch": 0.056338028169014086, "grad_norm": 0.2732111680294754, "learning_rate": 9.980434110374725e-06, "loss": 0.1106, "step": 44 }, { "epoch": 0.05761843790012804, "grad_norm": 0.21786457379356453, "learning_rate": 9.979535259476487e-06, "loss": 0.117, "step": 45 }, { "epoch": 0.058898847631242, "grad_norm": 0.2382562305629573, "learning_rate": 9.978616265476253e-06, "loss": 0.1089, "step": 46 }, { "epoch": 0.06017925736235595, "grad_norm": 0.22003286112109274, "learning_rate": 9.977677132091517e-06, "loss": 0.1147, "step": 47 }, { "epoch": 0.06145966709346991, "grad_norm": 0.2367022344212335, "learning_rate": 9.976717863121239e-06, "loss": 0.1104, "step": 48 }, { "epoch": 0.06274007682458387, "grad_norm": 0.21730491769623572, "learning_rate": 9.975738462445834e-06, "loss": 0.1185, "step": 49 }, { "epoch": 0.06402048655569782, "grad_norm": 0.24414133994886209, "learning_rate": 9.97473893402715e-06, "loss": 0.1442, "step": 50 }, { "epoch": 0.06530089628681178, "grad_norm": 0.22246384527562518, "learning_rate": 9.973719281908455e-06, "loss": 0.1012, "step": 51 }, { "epoch": 0.06658130601792574, "grad_norm": 0.2306650554509033, "learning_rate": 9.972679510214425e-06, "loss": 0.1376, "step": 52 }, { "epoch": 0.0678617157490397, "grad_norm": 0.20396378704905613, "learning_rate": 9.971619623151118e-06, "loss": 0.1198, "step": 53 }, { "epoch": 0.06914212548015365, "grad_norm": 0.584299779810075, "learning_rate": 9.970539625005966e-06, "loss": 0.1309, "step": 54 }, { "epoch": 0.07042253521126761, "grad_norm": 0.23049209168076623, "learning_rate": 9.969439520147754e-06, "loss": 0.11, "step": 55 }, { "epoch": 0.07170294494238157, "grad_norm": 0.31169465956741743, "learning_rate": 9.9683193130266e-06, "loss": 0.1215, "step": 56 }, { "epoch": 0.07298335467349552, "grad_norm": 0.19690804654538296, "learning_rate": 9.96717900817394e-06, "loss": 0.1098, "step": 57 }, { "epoch": 0.07426376440460948, "grad_norm": 0.27257820908146946, "learning_rate": 9.96601861020251e-06, "loss": 0.1214, "step": 58 }, { "epoch": 0.07554417413572344, "grad_norm": 0.22744323176717496, "learning_rate": 9.964838123806322e-06, "loss": 0.1239, "step": 59 }, { "epoch": 0.07682458386683738, "grad_norm": 0.1964901641357251, "learning_rate": 9.963637553760658e-06, "loss": 0.1028, "step": 60 }, { "epoch": 0.07810499359795134, "grad_norm": 0.26599941608546995, "learning_rate": 9.962416904922032e-06, "loss": 0.1246, "step": 61 }, { "epoch": 0.0793854033290653, "grad_norm": 0.2057507259678438, "learning_rate": 9.961176182228188e-06, "loss": 0.1214, "step": 62 }, { "epoch": 0.08066581306017925, "grad_norm": 0.2137602796891615, "learning_rate": 9.959915390698066e-06, "loss": 0.1182, "step": 63 }, { "epoch": 0.08194622279129321, "grad_norm": 0.19225657624086823, "learning_rate": 9.95863453543179e-06, "loss": 0.0996, "step": 64 }, { "epoch": 0.08322663252240717, "grad_norm": 0.20559487455057773, "learning_rate": 9.957333621610652e-06, "loss": 0.1087, "step": 65 }, { "epoch": 0.08450704225352113, "grad_norm": 0.20414411737819446, "learning_rate": 9.956012654497073e-06, "loss": 0.1143, "step": 66 }, { "epoch": 0.08578745198463508, "grad_norm": 0.25559410997091103, "learning_rate": 9.954671639434603e-06, "loss": 0.1235, "step": 67 }, { "epoch": 0.08706786171574904, "grad_norm": 0.24732533880497834, "learning_rate": 9.953310581847884e-06, "loss": 0.1236, "step": 68 }, { "epoch": 0.088348271446863, "grad_norm": 0.2692100842538577, "learning_rate": 9.951929487242635e-06, "loss": 0.1518, "step": 69 }, { "epoch": 0.08962868117797695, "grad_norm": 0.31038015450264617, "learning_rate": 9.950528361205627e-06, "loss": 0.1084, "step": 70 }, { "epoch": 0.09090909090909091, "grad_norm": 0.23304763056556263, "learning_rate": 9.949107209404664e-06, "loss": 0.1253, "step": 71 }, { "epoch": 0.09218950064020487, "grad_norm": 0.20835235677225467, "learning_rate": 9.947666037588557e-06, "loss": 0.1048, "step": 72 }, { "epoch": 0.09346991037131883, "grad_norm": 0.21571066464086827, "learning_rate": 9.946204851587102e-06, "loss": 0.116, "step": 73 }, { "epoch": 0.09475032010243278, "grad_norm": 0.2247746633436497, "learning_rate": 9.944723657311053e-06, "loss": 0.1346, "step": 74 }, { "epoch": 0.09603072983354674, "grad_norm": 0.23058319208645234, "learning_rate": 9.943222460752105e-06, "loss": 0.1157, "step": 75 }, { "epoch": 0.0973111395646607, "grad_norm": 0.22676162806455213, "learning_rate": 9.941701267982864e-06, "loss": 0.1064, "step": 76 }, { "epoch": 0.09859154929577464, "grad_norm": 0.21149432984801722, "learning_rate": 9.94016008515682e-06, "loss": 0.1165, "step": 77 }, { "epoch": 0.0998719590268886, "grad_norm": 0.3101573719118893, "learning_rate": 9.938598918508338e-06, "loss": 0.154, "step": 78 }, { "epoch": 0.10115236875800256, "grad_norm": 0.23491908115062252, "learning_rate": 9.937017774352606e-06, "loss": 0.1137, "step": 79 }, { "epoch": 0.10243277848911651, "grad_norm": 0.2028725473389837, "learning_rate": 9.935416659085639e-06, "loss": 0.104, "step": 80 }, { "epoch": 0.10371318822023047, "grad_norm": 0.21336330695697256, "learning_rate": 9.933795579184225e-06, "loss": 0.1122, "step": 81 }, { "epoch": 0.10499359795134443, "grad_norm": 0.203352860587917, "learning_rate": 9.932154541205925e-06, "loss": 0.0928, "step": 82 }, { "epoch": 0.10627400768245839, "grad_norm": 0.29283477122066165, "learning_rate": 9.930493551789024e-06, "loss": 0.107, "step": 83 }, { "epoch": 0.10755441741357234, "grad_norm": 0.21605775061614155, "learning_rate": 9.928812617652522e-06, "loss": 0.1198, "step": 84 }, { "epoch": 0.1088348271446863, "grad_norm": 0.25522801659680666, "learning_rate": 9.927111745596093e-06, "loss": 0.1089, "step": 85 }, { "epoch": 0.11011523687580026, "grad_norm": 0.20867628658824572, "learning_rate": 9.925390942500066e-06, "loss": 0.1065, "step": 86 }, { "epoch": 0.11139564660691421, "grad_norm": 0.24873358959814562, "learning_rate": 9.923650215325395e-06, "loss": 0.1192, "step": 87 }, { "epoch": 0.11267605633802817, "grad_norm": 0.2078743071517838, "learning_rate": 9.921889571113629e-06, "loss": 0.1107, "step": 88 }, { "epoch": 0.11395646606914213, "grad_norm": 0.2243597842114452, "learning_rate": 9.920109016986885e-06, "loss": 0.1142, "step": 89 }, { "epoch": 0.11523687580025609, "grad_norm": 0.23561488343094458, "learning_rate": 9.918308560147823e-06, "loss": 0.1138, "step": 90 }, { "epoch": 0.11651728553137004, "grad_norm": 0.20763089548111788, "learning_rate": 9.916488207879605e-06, "loss": 0.1149, "step": 91 }, { "epoch": 0.117797695262484, "grad_norm": 0.21205649526077106, "learning_rate": 9.914647967545881e-06, "loss": 0.1151, "step": 92 }, { "epoch": 0.11907810499359796, "grad_norm": 0.20711707368077145, "learning_rate": 9.91278784659075e-06, "loss": 0.1056, "step": 93 }, { "epoch": 0.1203585147247119, "grad_norm": 0.24167733547480985, "learning_rate": 9.910907852538729e-06, "loss": 0.1061, "step": 94 }, { "epoch": 0.12163892445582586, "grad_norm": 0.24038746916262685, "learning_rate": 9.909007992994726e-06, "loss": 0.1181, "step": 95 }, { "epoch": 0.12291933418693982, "grad_norm": 0.21114874400767178, "learning_rate": 9.907088275644012e-06, "loss": 0.1032, "step": 96 }, { "epoch": 0.12419974391805377, "grad_norm": 0.21182063039721938, "learning_rate": 9.905148708252183e-06, "loss": 0.1178, "step": 97 }, { "epoch": 0.12548015364916773, "grad_norm": 0.2780445704996857, "learning_rate": 9.90318929866513e-06, "loss": 0.1159, "step": 98 }, { "epoch": 0.1267605633802817, "grad_norm": 0.20982643452824534, "learning_rate": 9.901210054809015e-06, "loss": 0.1131, "step": 99 }, { "epoch": 0.12804097311139565, "grad_norm": 0.20736276164472214, "learning_rate": 9.899210984690229e-06, "loss": 0.0925, "step": 100 }, { "epoch": 0.1293213828425096, "grad_norm": 0.2271267961119714, "learning_rate": 9.897192096395362e-06, "loss": 0.1275, "step": 101 }, { "epoch": 0.13060179257362356, "grad_norm": 0.24608216671478705, "learning_rate": 9.89515339809118e-06, "loss": 0.1313, "step": 102 }, { "epoch": 0.13188220230473752, "grad_norm": 0.21510345619070287, "learning_rate": 9.893094898024573e-06, "loss": 0.1028, "step": 103 }, { "epoch": 0.13316261203585147, "grad_norm": 0.21697317036379377, "learning_rate": 9.891016604522543e-06, "loss": 0.1171, "step": 104 }, { "epoch": 0.13444302176696543, "grad_norm": 0.2136638725986537, "learning_rate": 9.888918525992153e-06, "loss": 0.1025, "step": 105 }, { "epoch": 0.1357234314980794, "grad_norm": 0.21306338645168824, "learning_rate": 9.886800670920503e-06, "loss": 0.1076, "step": 106 }, { "epoch": 0.13700384122919335, "grad_norm": 0.2031709342472571, "learning_rate": 9.88466304787469e-06, "loss": 0.1087, "step": 107 }, { "epoch": 0.1382842509603073, "grad_norm": 0.21923816159927167, "learning_rate": 9.882505665501778e-06, "loss": 0.1144, "step": 108 }, { "epoch": 0.13956466069142126, "grad_norm": 0.33196299693234177, "learning_rate": 9.880328532528764e-06, "loss": 0.1012, "step": 109 }, { "epoch": 0.14084507042253522, "grad_norm": 0.21529402955174173, "learning_rate": 9.878131657762535e-06, "loss": 0.1124, "step": 110 }, { "epoch": 0.14212548015364918, "grad_norm": 0.22281968953230835, "learning_rate": 9.875915050089836e-06, "loss": 0.1163, "step": 111 }, { "epoch": 0.14340588988476313, "grad_norm": 0.24024392849418508, "learning_rate": 9.87367871847724e-06, "loss": 0.0931, "step": 112 }, { "epoch": 0.1446862996158771, "grad_norm": 0.22588630575782578, "learning_rate": 9.871422671971105e-06, "loss": 0.1184, "step": 113 }, { "epoch": 0.14596670934699105, "grad_norm": 0.24656210706301043, "learning_rate": 9.869146919697536e-06, "loss": 0.1023, "step": 114 }, { "epoch": 0.147247119078105, "grad_norm": 0.2261727252048021, "learning_rate": 9.866851470862356e-06, "loss": 0.1258, "step": 115 }, { "epoch": 0.14852752880921896, "grad_norm": 0.2307271533489196, "learning_rate": 9.864536334751063e-06, "loss": 0.1092, "step": 116 }, { "epoch": 0.14980793854033292, "grad_norm": 0.2136173663715103, "learning_rate": 9.86220152072879e-06, "loss": 0.1144, "step": 117 }, { "epoch": 0.15108834827144688, "grad_norm": 0.2013183549217224, "learning_rate": 9.859847038240273e-06, "loss": 0.102, "step": 118 }, { "epoch": 0.1523687580025608, "grad_norm": 0.2580811513755183, "learning_rate": 9.857472896809814e-06, "loss": 0.0934, "step": 119 }, { "epoch": 0.15364916773367476, "grad_norm": 0.24648769944703175, "learning_rate": 9.855079106041233e-06, "loss": 0.1135, "step": 120 }, { "epoch": 0.15492957746478872, "grad_norm": 0.22166665439331903, "learning_rate": 9.852665675617837e-06, "loss": 0.1075, "step": 121 }, { "epoch": 0.15620998719590268, "grad_norm": 0.215764121524641, "learning_rate": 9.850232615302382e-06, "loss": 0.1178, "step": 122 }, { "epoch": 0.15749039692701663, "grad_norm": 0.23782584512766114, "learning_rate": 9.847779934937027e-06, "loss": 0.1036, "step": 123 }, { "epoch": 0.1587708066581306, "grad_norm": 0.2673826033445351, "learning_rate": 9.845307644443296e-06, "loss": 0.1153, "step": 124 }, { "epoch": 0.16005121638924455, "grad_norm": 0.22844139622499607, "learning_rate": 9.842815753822045e-06, "loss": 0.1011, "step": 125 }, { "epoch": 0.1613316261203585, "grad_norm": 0.2616971946673868, "learning_rate": 9.840304273153411e-06, "loss": 0.1127, "step": 126 }, { "epoch": 0.16261203585147246, "grad_norm": 0.23199850141796458, "learning_rate": 9.83777321259678e-06, "loss": 0.1195, "step": 127 }, { "epoch": 0.16389244558258642, "grad_norm": 1.0678055933226287, "learning_rate": 9.835222582390737e-06, "loss": 0.1094, "step": 128 }, { "epoch": 0.16517285531370038, "grad_norm": 0.22203807884974602, "learning_rate": 9.832652392853038e-06, "loss": 0.1224, "step": 129 }, { "epoch": 0.16645326504481434, "grad_norm": 0.20676536354490951, "learning_rate": 9.830062654380549e-06, "loss": 0.0957, "step": 130 }, { "epoch": 0.1677336747759283, "grad_norm": 0.295531810100396, "learning_rate": 9.827453377449225e-06, "loss": 0.122, "step": 131 }, { "epoch": 0.16901408450704225, "grad_norm": 0.21278086377815433, "learning_rate": 9.82482457261405e-06, "loss": 0.0941, "step": 132 }, { "epoch": 0.1702944942381562, "grad_norm": 0.21277758711733244, "learning_rate": 9.822176250509008e-06, "loss": 0.1054, "step": 133 }, { "epoch": 0.17157490396927016, "grad_norm": 0.19446151549654608, "learning_rate": 9.819508421847031e-06, "loss": 0.0971, "step": 134 }, { "epoch": 0.17285531370038412, "grad_norm": 0.2752181032376664, "learning_rate": 9.816821097419956e-06, "loss": 0.1099, "step": 135 }, { "epoch": 0.17413572343149808, "grad_norm": 0.2287536404697376, "learning_rate": 9.814114288098487e-06, "loss": 0.0972, "step": 136 }, { "epoch": 0.17541613316261204, "grad_norm": 0.2235448180551025, "learning_rate": 9.811388004832148e-06, "loss": 0.1236, "step": 137 }, { "epoch": 0.176696542893726, "grad_norm": 0.24978166796775542, "learning_rate": 9.808642258649238e-06, "loss": 0.1086, "step": 138 }, { "epoch": 0.17797695262483995, "grad_norm": 0.38275085108728213, "learning_rate": 9.805877060656786e-06, "loss": 0.1023, "step": 139 }, { "epoch": 0.1792573623559539, "grad_norm": 0.27811802323964147, "learning_rate": 9.803092422040506e-06, "loss": 0.1162, "step": 140 }, { "epoch": 0.18053777208706787, "grad_norm": 0.36534362150249305, "learning_rate": 9.800288354064756e-06, "loss": 0.1095, "step": 141 }, { "epoch": 0.18181818181818182, "grad_norm": 0.20486068837249038, "learning_rate": 9.797464868072489e-06, "loss": 0.1026, "step": 142 }, { "epoch": 0.18309859154929578, "grad_norm": 0.200010756521382, "learning_rate": 9.7946219754852e-06, "loss": 0.0986, "step": 143 }, { "epoch": 0.18437900128040974, "grad_norm": 0.204202191883862, "learning_rate": 9.791759687802903e-06, "loss": 0.1024, "step": 144 }, { "epoch": 0.1856594110115237, "grad_norm": 0.20843143519539428, "learning_rate": 9.788878016604048e-06, "loss": 0.099, "step": 145 }, { "epoch": 0.18693982074263765, "grad_norm": 0.225654154084788, "learning_rate": 9.785976973545512e-06, "loss": 0.134, "step": 146 }, { "epoch": 0.1882202304737516, "grad_norm": 0.232596129227796, "learning_rate": 9.783056570362526e-06, "loss": 0.0969, "step": 147 }, { "epoch": 0.18950064020486557, "grad_norm": 0.296716353066149, "learning_rate": 9.780116818868636e-06, "loss": 0.1064, "step": 148 }, { "epoch": 0.19078104993597952, "grad_norm": 0.20661298912483694, "learning_rate": 9.777157730955661e-06, "loss": 0.1, "step": 149 }, { "epoch": 0.19206145966709348, "grad_norm": 0.2428950315394032, "learning_rate": 9.774179318593635e-06, "loss": 0.1138, "step": 150 }, { "epoch": 0.19334186939820744, "grad_norm": 0.22240054828096548, "learning_rate": 9.771181593830762e-06, "loss": 0.1005, "step": 151 }, { "epoch": 0.1946222791293214, "grad_norm": 0.3104127220940388, "learning_rate": 9.768164568793372e-06, "loss": 0.1115, "step": 152 }, { "epoch": 0.19590268886043533, "grad_norm": 0.2216478703019566, "learning_rate": 9.76512825568586e-06, "loss": 0.115, "step": 153 }, { "epoch": 0.19718309859154928, "grad_norm": 0.39255199244531097, "learning_rate": 9.762072666790658e-06, "loss": 0.1144, "step": 154 }, { "epoch": 0.19846350832266324, "grad_norm": 0.20246051256828537, "learning_rate": 9.758997814468158e-06, "loss": 0.0956, "step": 155 }, { "epoch": 0.1997439180537772, "grad_norm": 0.23956683583178173, "learning_rate": 9.755903711156685e-06, "loss": 0.1114, "step": 156 }, { "epoch": 0.20102432778489115, "grad_norm": 0.2412902245649482, "learning_rate": 9.752790369372434e-06, "loss": 0.0897, "step": 157 }, { "epoch": 0.2023047375160051, "grad_norm": 0.222997611933498, "learning_rate": 9.749657801709425e-06, "loss": 0.1089, "step": 158 }, { "epoch": 0.20358514724711907, "grad_norm": 0.21502706306044234, "learning_rate": 9.746506020839449e-06, "loss": 0.1144, "step": 159 }, { "epoch": 0.20486555697823303, "grad_norm": 0.21811589131400422, "learning_rate": 9.743335039512015e-06, "loss": 0.1073, "step": 160 }, { "epoch": 0.20614596670934698, "grad_norm": 0.24460194607801816, "learning_rate": 9.740144870554306e-06, "loss": 0.1099, "step": 161 }, { "epoch": 0.20742637644046094, "grad_norm": 0.22537032282752506, "learning_rate": 9.736935526871121e-06, "loss": 0.1115, "step": 162 }, { "epoch": 0.2087067861715749, "grad_norm": 0.22015411149105688, "learning_rate": 9.733707021444823e-06, "loss": 0.1011, "step": 163 }, { "epoch": 0.20998719590268886, "grad_norm": 0.24214159992272408, "learning_rate": 9.730459367335288e-06, "loss": 0.135, "step": 164 }, { "epoch": 0.2112676056338028, "grad_norm": 0.2360446999936293, "learning_rate": 9.727192577679852e-06, "loss": 0.1063, "step": 165 }, { "epoch": 0.21254801536491677, "grad_norm": 0.24421675091024792, "learning_rate": 9.72390666569326e-06, "loss": 0.121, "step": 166 }, { "epoch": 0.21382842509603073, "grad_norm": 0.202683696520402, "learning_rate": 9.720601644667604e-06, "loss": 0.1034, "step": 167 }, { "epoch": 0.21510883482714468, "grad_norm": 0.26128937041633765, "learning_rate": 9.717277527972282e-06, "loss": 0.1181, "step": 168 }, { "epoch": 0.21638924455825864, "grad_norm": 0.24843747701109925, "learning_rate": 9.713934329053933e-06, "loss": 0.0941, "step": 169 }, { "epoch": 0.2176696542893726, "grad_norm": 0.2090188314645648, "learning_rate": 9.710572061436389e-06, "loss": 0.1045, "step": 170 }, { "epoch": 0.21895006402048656, "grad_norm": 0.29614550998615535, "learning_rate": 9.70719073872062e-06, "loss": 0.1193, "step": 171 }, { "epoch": 0.22023047375160051, "grad_norm": 0.21984198302472788, "learning_rate": 9.703790374584674e-06, "loss": 0.1008, "step": 172 }, { "epoch": 0.22151088348271447, "grad_norm": 0.2423523289973834, "learning_rate": 9.700370982783625e-06, "loss": 0.1262, "step": 173 }, { "epoch": 0.22279129321382843, "grad_norm": 0.2865608002517382, "learning_rate": 9.69693257714952e-06, "loss": 0.1087, "step": 174 }, { "epoch": 0.22407170294494239, "grad_norm": 0.2231141085616036, "learning_rate": 9.693475171591319e-06, "loss": 0.1216, "step": 175 }, { "epoch": 0.22535211267605634, "grad_norm": 0.2193149401830866, "learning_rate": 9.689998780094839e-06, "loss": 0.1083, "step": 176 }, { "epoch": 0.2266325224071703, "grad_norm": 0.22914519892164967, "learning_rate": 9.686503416722696e-06, "loss": 0.1069, "step": 177 }, { "epoch": 0.22791293213828426, "grad_norm": 0.29996072066588375, "learning_rate": 9.682989095614262e-06, "loss": 0.0958, "step": 178 }, { "epoch": 0.22919334186939821, "grad_norm": 0.25182470555841974, "learning_rate": 9.679455830985579e-06, "loss": 0.1219, "step": 179 }, { "epoch": 0.23047375160051217, "grad_norm": 0.2602697962253673, "learning_rate": 9.675903637129333e-06, "loss": 0.0975, "step": 180 }, { "epoch": 0.23175416133162613, "grad_norm": 0.2208597266931842, "learning_rate": 9.672332528414778e-06, "loss": 0.0987, "step": 181 }, { "epoch": 0.2330345710627401, "grad_norm": 0.2139651795292278, "learning_rate": 9.668742519287681e-06, "loss": 0.1164, "step": 182 }, { "epoch": 0.23431498079385404, "grad_norm": 0.2809120923323904, "learning_rate": 9.665133624270262e-06, "loss": 0.1283, "step": 183 }, { "epoch": 0.235595390524968, "grad_norm": 0.19639324489086046, "learning_rate": 9.661505857961142e-06, "loss": 0.0887, "step": 184 }, { "epoch": 0.23687580025608196, "grad_norm": 0.24202821256919488, "learning_rate": 9.657859235035279e-06, "loss": 0.1073, "step": 185 }, { "epoch": 0.23815620998719592, "grad_norm": 0.2177523646626564, "learning_rate": 9.654193770243907e-06, "loss": 0.1176, "step": 186 }, { "epoch": 0.23943661971830985, "grad_norm": 0.246177290984774, "learning_rate": 9.650509478414483e-06, "loss": 0.103, "step": 187 }, { "epoch": 0.2407170294494238, "grad_norm": 0.21329988203418188, "learning_rate": 9.646806374450615e-06, "loss": 0.1032, "step": 188 }, { "epoch": 0.24199743918053776, "grad_norm": 0.2280300220554783, "learning_rate": 9.643084473332018e-06, "loss": 0.0961, "step": 189 }, { "epoch": 0.24327784891165172, "grad_norm": 0.22747971311100154, "learning_rate": 9.639343790114443e-06, "loss": 0.1335, "step": 190 }, { "epoch": 0.24455825864276567, "grad_norm": 0.22435352707325737, "learning_rate": 9.635584339929612e-06, "loss": 0.1056, "step": 191 }, { "epoch": 0.24583866837387963, "grad_norm": 0.24333298818766996, "learning_rate": 9.631806137985167e-06, "loss": 0.1081, "step": 192 }, { "epoch": 0.2471190781049936, "grad_norm": 0.2195029746382595, "learning_rate": 9.628009199564608e-06, "loss": 0.1119, "step": 193 }, { "epoch": 0.24839948783610755, "grad_norm": 0.22689811326381693, "learning_rate": 9.624193540027219e-06, "loss": 0.0876, "step": 194 }, { "epoch": 0.2496798975672215, "grad_norm": 0.19048696785926153, "learning_rate": 9.62035917480802e-06, "loss": 0.0801, "step": 195 }, { "epoch": 0.25096030729833546, "grad_norm": 0.18997919368119753, "learning_rate": 9.616506119417698e-06, "loss": 0.0864, "step": 196 }, { "epoch": 0.25224071702944945, "grad_norm": 0.21753847747811403, "learning_rate": 9.612634389442545e-06, "loss": 0.1013, "step": 197 }, { "epoch": 0.2535211267605634, "grad_norm": 0.208334388717001, "learning_rate": 9.608744000544392e-06, "loss": 0.0969, "step": 198 }, { "epoch": 0.25480153649167736, "grad_norm": 0.2517511056933105, "learning_rate": 9.604834968460554e-06, "loss": 0.1226, "step": 199 }, { "epoch": 0.2560819462227913, "grad_norm": 0.24662823616934423, "learning_rate": 9.600907309003756e-06, "loss": 0.1003, "step": 200 }, { "epoch": 0.2560819462227913, "eval_loss": 0.10295484215021133, "eval_runtime": 10.9578, "eval_samples_per_second": 23.089, "eval_steps_per_second": 5.841, "step": 200 }, { "epoch": 0.2573623559539053, "grad_norm": 0.19800989525163418, "learning_rate": 9.596961038062077e-06, "loss": 0.09, "step": 201 }, { "epoch": 0.2586427656850192, "grad_norm": 0.19429372471818235, "learning_rate": 9.592996171598882e-06, "loss": 0.089, "step": 202 }, { "epoch": 0.2599231754161332, "grad_norm": 0.20721770437541243, "learning_rate": 9.589012725652757e-06, "loss": 0.0973, "step": 203 }, { "epoch": 0.2612035851472471, "grad_norm": 0.21086016882025133, "learning_rate": 9.585010716337447e-06, "loss": 0.0969, "step": 204 }, { "epoch": 0.26248399487836105, "grad_norm": 0.2014157078120381, "learning_rate": 9.580990159841788e-06, "loss": 0.0849, "step": 205 }, { "epoch": 0.26376440460947503, "grad_norm": 0.28287727596695916, "learning_rate": 9.576951072429644e-06, "loss": 0.1029, "step": 206 }, { "epoch": 0.26504481434058896, "grad_norm": 0.2099040784395317, "learning_rate": 9.57289347043984e-06, "loss": 0.0997, "step": 207 }, { "epoch": 0.26632522407170295, "grad_norm": 0.21273331739730908, "learning_rate": 9.56881737028609e-06, "loss": 0.0926, "step": 208 }, { "epoch": 0.2676056338028169, "grad_norm": 0.26648185760001897, "learning_rate": 9.564722788456943e-06, "loss": 0.0934, "step": 209 }, { "epoch": 0.26888604353393086, "grad_norm": 0.2974403789598187, "learning_rate": 9.560609741515711e-06, "loss": 0.1093, "step": 210 }, { "epoch": 0.2701664532650448, "grad_norm": 0.21511245367067508, "learning_rate": 9.556478246100395e-06, "loss": 0.1001, "step": 211 }, { "epoch": 0.2714468629961588, "grad_norm": 0.42928420784487775, "learning_rate": 9.552328318923626e-06, "loss": 0.0965, "step": 212 }, { "epoch": 0.2727272727272727, "grad_norm": 0.2214995856589624, "learning_rate": 9.548159976772593e-06, "loss": 0.1072, "step": 213 }, { "epoch": 0.2740076824583867, "grad_norm": 0.23987291695012447, "learning_rate": 9.543973236508978e-06, "loss": 0.1298, "step": 214 }, { "epoch": 0.2752880921895006, "grad_norm": 1.0611216323316597, "learning_rate": 9.539768115068891e-06, "loss": 0.1299, "step": 215 }, { "epoch": 0.2765685019206146, "grad_norm": 0.20378285734342158, "learning_rate": 9.535544629462788e-06, "loss": 0.096, "step": 216 }, { "epoch": 0.27784891165172854, "grad_norm": 0.2205019465799966, "learning_rate": 9.531302796775416e-06, "loss": 0.1164, "step": 217 }, { "epoch": 0.2791293213828425, "grad_norm": 0.22444546803514145, "learning_rate": 9.527042634165745e-06, "loss": 0.1037, "step": 218 }, { "epoch": 0.28040973111395645, "grad_norm": 0.20343271038310107, "learning_rate": 9.522764158866882e-06, "loss": 0.1053, "step": 219 }, { "epoch": 0.28169014084507044, "grad_norm": 0.26572811709690564, "learning_rate": 9.51846738818602e-06, "loss": 0.0872, "step": 220 }, { "epoch": 0.28297055057618437, "grad_norm": 0.5077960176056625, "learning_rate": 9.514152339504356e-06, "loss": 0.1023, "step": 221 }, { "epoch": 0.28425096030729835, "grad_norm": 0.2282726059991323, "learning_rate": 9.509819030277027e-06, "loss": 0.0928, "step": 222 }, { "epoch": 0.2855313700384123, "grad_norm": 0.21209173117577554, "learning_rate": 9.505467478033036e-06, "loss": 0.1095, "step": 223 }, { "epoch": 0.28681177976952626, "grad_norm": 0.21435324933922675, "learning_rate": 9.50109770037518e-06, "loss": 0.0931, "step": 224 }, { "epoch": 0.2880921895006402, "grad_norm": 0.19786324229716037, "learning_rate": 9.49670971497999e-06, "loss": 0.0823, "step": 225 }, { "epoch": 0.2893725992317542, "grad_norm": 0.21254211994548683, "learning_rate": 9.492303539597637e-06, "loss": 0.0982, "step": 226 }, { "epoch": 0.2906530089628681, "grad_norm": 0.2308326775027705, "learning_rate": 9.487879192051885e-06, "loss": 0.119, "step": 227 }, { "epoch": 0.2919334186939821, "grad_norm": 0.23316946139937447, "learning_rate": 9.483436690240001e-06, "loss": 0.117, "step": 228 }, { "epoch": 0.293213828425096, "grad_norm": 0.22482406644037425, "learning_rate": 9.478976052132694e-06, "loss": 0.1007, "step": 229 }, { "epoch": 0.29449423815621, "grad_norm": 0.2269661661047752, "learning_rate": 9.474497295774031e-06, "loss": 0.1195, "step": 230 }, { "epoch": 0.29577464788732394, "grad_norm": 0.25601193342589307, "learning_rate": 9.470000439281379e-06, "loss": 0.0934, "step": 231 }, { "epoch": 0.2970550576184379, "grad_norm": 0.3501072294362615, "learning_rate": 9.465485500845317e-06, "loss": 0.1106, "step": 232 }, { "epoch": 0.29833546734955185, "grad_norm": 0.20063532308699464, "learning_rate": 9.460952498729572e-06, "loss": 0.1022, "step": 233 }, { "epoch": 0.29961587708066584, "grad_norm": 0.2564752547786142, "learning_rate": 9.456401451270937e-06, "loss": 0.0949, "step": 234 }, { "epoch": 0.30089628681177977, "grad_norm": 0.23331704506950626, "learning_rate": 9.451832376879208e-06, "loss": 0.1076, "step": 235 }, { "epoch": 0.30217669654289375, "grad_norm": 0.2046526398070333, "learning_rate": 9.447245294037101e-06, "loss": 0.0875, "step": 236 }, { "epoch": 0.3034571062740077, "grad_norm": 0.23245382089489702, "learning_rate": 9.442640221300174e-06, "loss": 0.0977, "step": 237 }, { "epoch": 0.3047375160051216, "grad_norm": 0.2967919985063825, "learning_rate": 9.438017177296767e-06, "loss": 0.1124, "step": 238 }, { "epoch": 0.3060179257362356, "grad_norm": 0.21722091708514227, "learning_rate": 9.433376180727906e-06, "loss": 0.0915, "step": 239 }, { "epoch": 0.3072983354673495, "grad_norm": 0.22288670643268046, "learning_rate": 9.42871725036725e-06, "loss": 0.0968, "step": 240 }, { "epoch": 0.3085787451984635, "grad_norm": 0.21909370074097287, "learning_rate": 9.424040405060994e-06, "loss": 0.0919, "step": 241 }, { "epoch": 0.30985915492957744, "grad_norm": 0.24123205359195735, "learning_rate": 9.419345663727805e-06, "loss": 0.089, "step": 242 }, { "epoch": 0.3111395646606914, "grad_norm": 0.2243974339932462, "learning_rate": 9.414633045358746e-06, "loss": 0.1037, "step": 243 }, { "epoch": 0.31241997439180536, "grad_norm": 0.23872148775876528, "learning_rate": 9.409902569017191e-06, "loss": 0.1043, "step": 244 }, { "epoch": 0.31370038412291934, "grad_norm": 0.21320137468093628, "learning_rate": 9.405154253838754e-06, "loss": 0.1006, "step": 245 }, { "epoch": 0.31498079385403327, "grad_norm": 0.2107815636744153, "learning_rate": 9.400388119031212e-06, "loss": 0.0934, "step": 246 }, { "epoch": 0.31626120358514725, "grad_norm": 0.22210836366388206, "learning_rate": 9.395604183874423e-06, "loss": 0.125, "step": 247 }, { "epoch": 0.3175416133162612, "grad_norm": 0.22242071498830648, "learning_rate": 9.390802467720256e-06, "loss": 0.0874, "step": 248 }, { "epoch": 0.31882202304737517, "grad_norm": 0.19599675247854556, "learning_rate": 9.385982989992495e-06, "loss": 0.08, "step": 249 }, { "epoch": 0.3201024327784891, "grad_norm": 0.22321459692766651, "learning_rate": 9.38114577018679e-06, "loss": 0.0975, "step": 250 }, { "epoch": 0.3213828425096031, "grad_norm": 0.20191960851059343, "learning_rate": 9.376290827870546e-06, "loss": 0.0896, "step": 251 }, { "epoch": 0.322663252240717, "grad_norm": 0.25999072344629115, "learning_rate": 9.371418182682866e-06, "loss": 0.1124, "step": 252 }, { "epoch": 0.323943661971831, "grad_norm": 0.22345061884667541, "learning_rate": 9.366527854334464e-06, "loss": 0.1033, "step": 253 }, { "epoch": 0.32522407170294493, "grad_norm": 0.19672193430862964, "learning_rate": 9.361619862607583e-06, "loss": 0.0889, "step": 254 }, { "epoch": 0.3265044814340589, "grad_norm": 0.22366952990835842, "learning_rate": 9.35669422735592e-06, "loss": 0.1106, "step": 255 }, { "epoch": 0.32778489116517284, "grad_norm": 0.21640061980887773, "learning_rate": 9.35175096850454e-06, "loss": 0.1121, "step": 256 }, { "epoch": 0.3290653008962868, "grad_norm": 0.22141421327858304, "learning_rate": 9.346790106049802e-06, "loss": 0.1085, "step": 257 }, { "epoch": 0.33034571062740076, "grad_norm": 0.23676730891720585, "learning_rate": 9.341811660059272e-06, "loss": 0.1054, "step": 258 }, { "epoch": 0.33162612035851474, "grad_norm": 0.21207819439717576, "learning_rate": 9.336815650671646e-06, "loss": 0.1046, "step": 259 }, { "epoch": 0.33290653008962867, "grad_norm": 0.24124877022847982, "learning_rate": 9.331802098096668e-06, "loss": 0.1145, "step": 260 }, { "epoch": 0.33418693982074266, "grad_norm": 0.2235922430636218, "learning_rate": 9.326771022615044e-06, "loss": 0.1013, "step": 261 }, { "epoch": 0.3354673495518566, "grad_norm": 0.24896557376262207, "learning_rate": 9.321722444578367e-06, "loss": 0.1061, "step": 262 }, { "epoch": 0.33674775928297057, "grad_norm": 0.21936170117279488, "learning_rate": 9.316656384409028e-06, "loss": 0.1078, "step": 263 }, { "epoch": 0.3380281690140845, "grad_norm": 0.21173184196076902, "learning_rate": 9.31157286260014e-06, "loss": 0.0929, "step": 264 }, { "epoch": 0.3393085787451985, "grad_norm": 0.2198310744059307, "learning_rate": 9.306471899715446e-06, "loss": 0.0991, "step": 265 }, { "epoch": 0.3405889884763124, "grad_norm": 0.24332344656702928, "learning_rate": 9.301353516389247e-06, "loss": 0.1291, "step": 266 }, { "epoch": 0.3418693982074264, "grad_norm": 0.2637093029075382, "learning_rate": 9.296217733326307e-06, "loss": 0.1039, "step": 267 }, { "epoch": 0.34314980793854033, "grad_norm": 0.22119078695696195, "learning_rate": 9.29106457130178e-06, "loss": 0.1165, "step": 268 }, { "epoch": 0.3444302176696543, "grad_norm": 0.2332100603615671, "learning_rate": 9.28589405116112e-06, "loss": 0.0994, "step": 269 }, { "epoch": 0.34571062740076824, "grad_norm": 0.21992802430418248, "learning_rate": 9.280706193819992e-06, "loss": 0.1032, "step": 270 }, { "epoch": 0.34699103713188223, "grad_norm": 0.19789439453537955, "learning_rate": 9.275501020264203e-06, "loss": 0.0797, "step": 271 }, { "epoch": 0.34827144686299616, "grad_norm": 0.22967961008226068, "learning_rate": 9.270278551549601e-06, "loss": 0.0926, "step": 272 }, { "epoch": 0.3495518565941101, "grad_norm": 0.2093806040827184, "learning_rate": 9.265038808801994e-06, "loss": 0.1015, "step": 273 }, { "epoch": 0.3508322663252241, "grad_norm": 0.21919188806421425, "learning_rate": 9.259781813217072e-06, "loss": 0.0975, "step": 274 }, { "epoch": 0.352112676056338, "grad_norm": 0.2085159299832725, "learning_rate": 9.25450758606031e-06, "loss": 0.0928, "step": 275 }, { "epoch": 0.353393085787452, "grad_norm": 0.2123207338157853, "learning_rate": 9.249216148666896e-06, "loss": 0.1034, "step": 276 }, { "epoch": 0.3546734955185659, "grad_norm": 0.2131101395086534, "learning_rate": 9.243907522441628e-06, "loss": 0.1014, "step": 277 }, { "epoch": 0.3559539052496799, "grad_norm": 0.23767263562570912, "learning_rate": 9.238581728858839e-06, "loss": 0.1137, "step": 278 }, { "epoch": 0.35723431498079383, "grad_norm": 0.23871357410469324, "learning_rate": 9.233238789462309e-06, "loss": 0.1048, "step": 279 }, { "epoch": 0.3585147247119078, "grad_norm": 0.21106416102591846, "learning_rate": 9.227878725865172e-06, "loss": 0.0879, "step": 280 }, { "epoch": 0.35979513444302175, "grad_norm": 0.22882796476030698, "learning_rate": 9.222501559749834e-06, "loss": 0.0828, "step": 281 }, { "epoch": 0.36107554417413573, "grad_norm": 0.22355168752170032, "learning_rate": 9.217107312867888e-06, "loss": 0.098, "step": 282 }, { "epoch": 0.36235595390524966, "grad_norm": 0.20531357685119725, "learning_rate": 9.211696007040015e-06, "loss": 0.0949, "step": 283 }, { "epoch": 0.36363636363636365, "grad_norm": 0.21953993366034744, "learning_rate": 9.206267664155906e-06, "loss": 0.0994, "step": 284 }, { "epoch": 0.3649167733674776, "grad_norm": 0.21577945470687593, "learning_rate": 9.20082230617417e-06, "loss": 0.1077, "step": 285 }, { "epoch": 0.36619718309859156, "grad_norm": 0.2025247747235751, "learning_rate": 9.195359955122244e-06, "loss": 0.096, "step": 286 }, { "epoch": 0.3674775928297055, "grad_norm": 0.2688790973634157, "learning_rate": 9.189880633096305e-06, "loss": 0.117, "step": 287 }, { "epoch": 0.3687580025608195, "grad_norm": 0.21428943404693138, "learning_rate": 9.184384362261181e-06, "loss": 0.1071, "step": 288 }, { "epoch": 0.3700384122919334, "grad_norm": 0.21974991360129034, "learning_rate": 9.178871164850262e-06, "loss": 0.1114, "step": 289 }, { "epoch": 0.3713188220230474, "grad_norm": 0.23199913282226747, "learning_rate": 9.173341063165406e-06, "loss": 0.1075, "step": 290 }, { "epoch": 0.3725992317541613, "grad_norm": 0.23179766384906195, "learning_rate": 9.167794079576856e-06, "loss": 0.1231, "step": 291 }, { "epoch": 0.3738796414852753, "grad_norm": 0.1983550021125703, "learning_rate": 9.16223023652314e-06, "loss": 0.0841, "step": 292 }, { "epoch": 0.37516005121638923, "grad_norm": 0.18806986905332912, "learning_rate": 9.15664955651099e-06, "loss": 0.0769, "step": 293 }, { "epoch": 0.3764404609475032, "grad_norm": 0.2428580418781726, "learning_rate": 9.151052062115247e-06, "loss": 0.1088, "step": 294 }, { "epoch": 0.37772087067861715, "grad_norm": 0.21366397330744122, "learning_rate": 9.145437775978765e-06, "loss": 0.0991, "step": 295 }, { "epoch": 0.37900128040973113, "grad_norm": 0.20506536940067413, "learning_rate": 9.139806720812326e-06, "loss": 0.092, "step": 296 }, { "epoch": 0.38028169014084506, "grad_norm": 0.2215164765821936, "learning_rate": 9.134158919394545e-06, "loss": 0.0923, "step": 297 }, { "epoch": 0.38156209987195905, "grad_norm": 0.23021684961464534, "learning_rate": 9.128494394571778e-06, "loss": 0.1022, "step": 298 }, { "epoch": 0.382842509603073, "grad_norm": 0.22935523491614063, "learning_rate": 9.122813169258036e-06, "loss": 0.1093, "step": 299 }, { "epoch": 0.38412291933418696, "grad_norm": 0.24342440641337806, "learning_rate": 9.117115266434875e-06, "loss": 0.1333, "step": 300 }, { "epoch": 0.3854033290653009, "grad_norm": 0.20142459693754752, "learning_rate": 9.111400709151325e-06, "loss": 0.0842, "step": 301 }, { "epoch": 0.3866837387964149, "grad_norm": 0.22689790466060908, "learning_rate": 9.105669520523781e-06, "loss": 0.101, "step": 302 }, { "epoch": 0.3879641485275288, "grad_norm": 0.2516707193506417, "learning_rate": 9.099921723735918e-06, "loss": 0.0964, "step": 303 }, { "epoch": 0.3892445582586428, "grad_norm": 0.23647352099837085, "learning_rate": 9.094157342038588e-06, "loss": 0.0981, "step": 304 }, { "epoch": 0.3905249679897567, "grad_norm": 0.20871088036121524, "learning_rate": 9.088376398749739e-06, "loss": 0.1001, "step": 305 }, { "epoch": 0.39180537772087065, "grad_norm": 0.2441731869064398, "learning_rate": 9.08257891725431e-06, "loss": 0.0942, "step": 306 }, { "epoch": 0.39308578745198464, "grad_norm": 0.19683407057171803, "learning_rate": 9.07676492100414e-06, "loss": 0.0784, "step": 307 }, { "epoch": 0.39436619718309857, "grad_norm": 0.24648657168932037, "learning_rate": 9.070934433517872e-06, "loss": 0.1016, "step": 308 }, { "epoch": 0.39564660691421255, "grad_norm": 0.20143625938934676, "learning_rate": 9.065087478380863e-06, "loss": 0.0904, "step": 309 }, { "epoch": 0.3969270166453265, "grad_norm": 0.22398622215836045, "learning_rate": 9.059224079245079e-06, "loss": 0.0974, "step": 310 }, { "epoch": 0.39820742637644047, "grad_norm": 0.2223390154172052, "learning_rate": 9.05334425982901e-06, "loss": 0.1062, "step": 311 }, { "epoch": 0.3994878361075544, "grad_norm": 0.2357592771569166, "learning_rate": 9.047448043917568e-06, "loss": 0.106, "step": 312 }, { "epoch": 0.4007682458386684, "grad_norm": 0.23728246623229327, "learning_rate": 9.041535455361989e-06, "loss": 0.09, "step": 313 }, { "epoch": 0.4020486555697823, "grad_norm": 0.21270633780249457, "learning_rate": 9.035606518079742e-06, "loss": 0.092, "step": 314 }, { "epoch": 0.4033290653008963, "grad_norm": 0.22389028590019153, "learning_rate": 9.029661256054425e-06, "loss": 0.1159, "step": 315 }, { "epoch": 0.4046094750320102, "grad_norm": 0.25014414377353883, "learning_rate": 9.023699693335678e-06, "loss": 0.0925, "step": 316 }, { "epoch": 0.4058898847631242, "grad_norm": 0.20590033966915014, "learning_rate": 9.01772185403908e-06, "loss": 0.0875, "step": 317 }, { "epoch": 0.40717029449423814, "grad_norm": 0.224920965986267, "learning_rate": 9.011727762346047e-06, "loss": 0.1071, "step": 318 }, { "epoch": 0.4084507042253521, "grad_norm": 0.22084966838043332, "learning_rate": 9.005717442503741e-06, "loss": 0.0984, "step": 319 }, { "epoch": 0.40973111395646605, "grad_norm": 0.2216933383922988, "learning_rate": 8.999690918824972e-06, "loss": 0.1008, "step": 320 }, { "epoch": 0.41101152368758004, "grad_norm": 0.20719411881213862, "learning_rate": 8.993648215688096e-06, "loss": 0.0984, "step": 321 }, { "epoch": 0.41229193341869397, "grad_norm": 0.2644213299145063, "learning_rate": 8.987589357536915e-06, "loss": 0.106, "step": 322 }, { "epoch": 0.41357234314980795, "grad_norm": 0.22337264682998753, "learning_rate": 8.981514368880584e-06, "loss": 0.1089, "step": 323 }, { "epoch": 0.4148527528809219, "grad_norm": 0.2123511889075597, "learning_rate": 8.975423274293509e-06, "loss": 0.1054, "step": 324 }, { "epoch": 0.41613316261203587, "grad_norm": 0.23697329819148216, "learning_rate": 8.969316098415246e-06, "loss": 0.1222, "step": 325 }, { "epoch": 0.4174135723431498, "grad_norm": 0.18718131630579438, "learning_rate": 8.963192865950404e-06, "loss": 0.0795, "step": 326 }, { "epoch": 0.4186939820742638, "grad_norm": 0.20726571155542783, "learning_rate": 8.95705360166854e-06, "loss": 0.0959, "step": 327 }, { "epoch": 0.4199743918053777, "grad_norm": 0.21032068929790337, "learning_rate": 8.950898330404067e-06, "loss": 0.1009, "step": 328 }, { "epoch": 0.4212548015364917, "grad_norm": 0.23159033425909467, "learning_rate": 8.944727077056146e-06, "loss": 0.1127, "step": 329 }, { "epoch": 0.4225352112676056, "grad_norm": 0.2241935349157919, "learning_rate": 8.938539866588593e-06, "loss": 0.0826, "step": 330 }, { "epoch": 0.4238156209987196, "grad_norm": 0.2166364470814423, "learning_rate": 8.932336724029766e-06, "loss": 0.0782, "step": 331 }, { "epoch": 0.42509603072983354, "grad_norm": 0.20410164865529315, "learning_rate": 8.926117674472479e-06, "loss": 0.0963, "step": 332 }, { "epoch": 0.4263764404609475, "grad_norm": 0.2635938648241115, "learning_rate": 8.919882743073885e-06, "loss": 0.1101, "step": 333 }, { "epoch": 0.42765685019206146, "grad_norm": 0.1890391596699968, "learning_rate": 8.91363195505539e-06, "loss": 0.0823, "step": 334 }, { "epoch": 0.42893725992317544, "grad_norm": 0.20593147494385333, "learning_rate": 8.907365335702535e-06, "loss": 0.1025, "step": 335 }, { "epoch": 0.43021766965428937, "grad_norm": 0.24475402626334472, "learning_rate": 8.901082910364907e-06, "loss": 0.0921, "step": 336 }, { "epoch": 0.43149807938540335, "grad_norm": 0.29107939881060624, "learning_rate": 8.894784704456028e-06, "loss": 0.1214, "step": 337 }, { "epoch": 0.4327784891165173, "grad_norm": 0.19547473349388864, "learning_rate": 8.888470743453257e-06, "loss": 0.0824, "step": 338 }, { "epoch": 0.43405889884763127, "grad_norm": 0.2160589501617305, "learning_rate": 8.882141052897683e-06, "loss": 0.1064, "step": 339 }, { "epoch": 0.4353393085787452, "grad_norm": 0.22838595100451395, "learning_rate": 8.875795658394028e-06, "loss": 0.0942, "step": 340 }, { "epoch": 0.43661971830985913, "grad_norm": 0.19616492601477323, "learning_rate": 8.869434585610534e-06, "loss": 0.0894, "step": 341 }, { "epoch": 0.4379001280409731, "grad_norm": 0.19800541842229377, "learning_rate": 8.86305786027887e-06, "loss": 0.0957, "step": 342 }, { "epoch": 0.43918053777208704, "grad_norm": 0.22767369934782125, "learning_rate": 8.856665508194016e-06, "loss": 0.1122, "step": 343 }, { "epoch": 0.44046094750320103, "grad_norm": 0.21910397163597442, "learning_rate": 8.850257555214169e-06, "loss": 0.0995, "step": 344 }, { "epoch": 0.44174135723431496, "grad_norm": 0.2803034838401473, "learning_rate": 8.843834027260635e-06, "loss": 0.0955, "step": 345 }, { "epoch": 0.44302176696542894, "grad_norm": 0.19693583233156847, "learning_rate": 8.837394950317722e-06, "loss": 0.0911, "step": 346 }, { "epoch": 0.44430217669654287, "grad_norm": 0.2142878047895678, "learning_rate": 8.830940350432634e-06, "loss": 0.0897, "step": 347 }, { "epoch": 0.44558258642765686, "grad_norm": 0.22408231174990428, "learning_rate": 8.824470253715373e-06, "loss": 0.0956, "step": 348 }, { "epoch": 0.4468629961587708, "grad_norm": 0.2370957771392853, "learning_rate": 8.817984686338628e-06, "loss": 0.1204, "step": 349 }, { "epoch": 0.44814340588988477, "grad_norm": 0.2140707854268636, "learning_rate": 8.811483674537663e-06, "loss": 0.1013, "step": 350 }, { "epoch": 0.4494238156209987, "grad_norm": 0.19890460187476416, "learning_rate": 8.804967244610224e-06, "loss": 0.0905, "step": 351 }, { "epoch": 0.4507042253521127, "grad_norm": 0.20865814474980504, "learning_rate": 8.798435422916425e-06, "loss": 0.112, "step": 352 }, { "epoch": 0.4519846350832266, "grad_norm": 0.18979477517441368, "learning_rate": 8.791888235878642e-06, "loss": 0.0834, "step": 353 }, { "epoch": 0.4532650448143406, "grad_norm": 0.1980905309599967, "learning_rate": 8.785325709981404e-06, "loss": 0.0852, "step": 354 }, { "epoch": 0.45454545454545453, "grad_norm": 0.23599100359936517, "learning_rate": 8.778747871771293e-06, "loss": 0.1023, "step": 355 }, { "epoch": 0.4558258642765685, "grad_norm": 0.23296475543169726, "learning_rate": 8.772154747856825e-06, "loss": 0.0941, "step": 356 }, { "epoch": 0.45710627400768244, "grad_norm": 0.219868364544871, "learning_rate": 8.76554636490836e-06, "loss": 0.1042, "step": 357 }, { "epoch": 0.45838668373879643, "grad_norm": 0.20723571263385598, "learning_rate": 8.758922749657974e-06, "loss": 0.0953, "step": 358 }, { "epoch": 0.45966709346991036, "grad_norm": 0.22270342093848983, "learning_rate": 8.752283928899362e-06, "loss": 0.0855, "step": 359 }, { "epoch": 0.46094750320102434, "grad_norm": 0.20704359241675452, "learning_rate": 8.745629929487733e-06, "loss": 0.0927, "step": 360 }, { "epoch": 0.4622279129321383, "grad_norm": 0.2176294157303934, "learning_rate": 8.738960778339691e-06, "loss": 0.1, "step": 361 }, { "epoch": 0.46350832266325226, "grad_norm": 0.23050503704277558, "learning_rate": 8.732276502433134e-06, "loss": 0.0813, "step": 362 }, { "epoch": 0.4647887323943662, "grad_norm": 0.29834497338396626, "learning_rate": 8.725577128807144e-06, "loss": 0.1276, "step": 363 }, { "epoch": 0.4660691421254802, "grad_norm": 0.2096869387423583, "learning_rate": 8.71886268456187e-06, "loss": 0.0765, "step": 364 }, { "epoch": 0.4673495518565941, "grad_norm": 0.21980929540701777, "learning_rate": 8.71213319685843e-06, "loss": 0.0864, "step": 365 }, { "epoch": 0.4686299615877081, "grad_norm": 0.2030343689139492, "learning_rate": 8.705388692918794e-06, "loss": 0.0937, "step": 366 }, { "epoch": 0.469910371318822, "grad_norm": 0.20873342981264223, "learning_rate": 8.698629200025674e-06, "loss": 0.0764, "step": 367 }, { "epoch": 0.471190781049936, "grad_norm": 0.2205927043158401, "learning_rate": 8.691854745522416e-06, "loss": 0.1027, "step": 368 }, { "epoch": 0.47247119078104993, "grad_norm": 0.21013716279614206, "learning_rate": 8.685065356812888e-06, "loss": 0.0997, "step": 369 }, { "epoch": 0.4737516005121639, "grad_norm": 0.26189426349253375, "learning_rate": 8.67826106136137e-06, "loss": 0.1005, "step": 370 }, { "epoch": 0.47503201024327785, "grad_norm": 0.20776310617784827, "learning_rate": 8.671441886692445e-06, "loss": 0.1044, "step": 371 }, { "epoch": 0.47631241997439183, "grad_norm": 0.2245580526343855, "learning_rate": 8.664607860390876e-06, "loss": 0.0973, "step": 372 }, { "epoch": 0.47759282970550576, "grad_norm": 0.1870672583287934, "learning_rate": 8.657759010101515e-06, "loss": 0.0754, "step": 373 }, { "epoch": 0.4788732394366197, "grad_norm": 0.1988890388362317, "learning_rate": 8.650895363529172e-06, "loss": 0.0854, "step": 374 }, { "epoch": 0.4801536491677337, "grad_norm": 0.2232419013421965, "learning_rate": 8.644016948438515e-06, "loss": 0.1147, "step": 375 }, { "epoch": 0.4814340588988476, "grad_norm": 0.23790074471335118, "learning_rate": 8.637123792653946e-06, "loss": 0.1045, "step": 376 }, { "epoch": 0.4827144686299616, "grad_norm": 0.23664389350337536, "learning_rate": 8.630215924059506e-06, "loss": 0.1099, "step": 377 }, { "epoch": 0.4839948783610755, "grad_norm": 0.24177438603779688, "learning_rate": 8.623293370598746e-06, "loss": 0.1122, "step": 378 }, { "epoch": 0.4852752880921895, "grad_norm": 0.22199815992256927, "learning_rate": 8.616356160274621e-06, "loss": 0.0939, "step": 379 }, { "epoch": 0.48655569782330343, "grad_norm": 0.25572452383178235, "learning_rate": 8.609404321149373e-06, "loss": 0.1156, "step": 380 }, { "epoch": 0.4878361075544174, "grad_norm": 0.20268581805440308, "learning_rate": 8.602437881344426e-06, "loss": 0.0877, "step": 381 }, { "epoch": 0.48911651728553135, "grad_norm": 0.21489123761960444, "learning_rate": 8.595456869040258e-06, "loss": 0.1075, "step": 382 }, { "epoch": 0.49039692701664533, "grad_norm": 0.24226701775653298, "learning_rate": 8.588461312476305e-06, "loss": 0.1118, "step": 383 }, { "epoch": 0.49167733674775926, "grad_norm": 0.20516313657448784, "learning_rate": 8.581451239950827e-06, "loss": 0.1043, "step": 384 }, { "epoch": 0.49295774647887325, "grad_norm": 0.21150993992952238, "learning_rate": 8.574426679820813e-06, "loss": 0.0958, "step": 385 }, { "epoch": 0.4942381562099872, "grad_norm": 0.20476924773020433, "learning_rate": 8.567387660501852e-06, "loss": 0.0959, "step": 386 }, { "epoch": 0.49551856594110116, "grad_norm": 0.2090032040096484, "learning_rate": 8.560334210468022e-06, "loss": 0.0857, "step": 387 }, { "epoch": 0.4967989756722151, "grad_norm": 0.27512843384692853, "learning_rate": 8.553266358251782e-06, "loss": 0.0999, "step": 388 }, { "epoch": 0.4980793854033291, "grad_norm": 0.21695948228965925, "learning_rate": 8.54618413244384e-06, "loss": 0.1054, "step": 389 }, { "epoch": 0.499359795134443, "grad_norm": 0.2897822611020935, "learning_rate": 8.539087561693059e-06, "loss": 0.1124, "step": 390 }, { "epoch": 0.5006402048655569, "grad_norm": 0.2289635458508745, "learning_rate": 8.53197667470632e-06, "loss": 0.1053, "step": 391 }, { "epoch": 0.5019206145966709, "grad_norm": 0.22621614860347966, "learning_rate": 8.52485150024842e-06, "loss": 0.099, "step": 392 }, { "epoch": 0.5032010243277849, "grad_norm": 0.22311626188509598, "learning_rate": 8.517712067141956e-06, "loss": 0.1216, "step": 393 }, { "epoch": 0.5044814340588989, "grad_norm": 0.23102611913472276, "learning_rate": 8.510558404267197e-06, "loss": 0.102, "step": 394 }, { "epoch": 0.5057618437900128, "grad_norm": 0.20662770441604916, "learning_rate": 8.503390540561975e-06, "loss": 0.0904, "step": 395 }, { "epoch": 0.5070422535211268, "grad_norm": 0.20865218058585458, "learning_rate": 8.496208505021572e-06, "loss": 0.0842, "step": 396 }, { "epoch": 0.5083226632522407, "grad_norm": 0.24514253151577206, "learning_rate": 8.489012326698589e-06, "loss": 0.1251, "step": 397 }, { "epoch": 0.5096030729833547, "grad_norm": 0.20996889547964323, "learning_rate": 8.481802034702843e-06, "loss": 0.096, "step": 398 }, { "epoch": 0.5108834827144686, "grad_norm": 0.23586974216149814, "learning_rate": 8.474577658201243e-06, "loss": 0.0962, "step": 399 }, { "epoch": 0.5121638924455826, "grad_norm": 0.23601595494719851, "learning_rate": 8.467339226417673e-06, "loss": 0.0882, "step": 400 }, { "epoch": 0.5121638924455826, "eval_loss": 0.0953243225812912, "eval_runtime": 10.9425, "eval_samples_per_second": 23.121, "eval_steps_per_second": 5.849, "step": 400 }, { "epoch": 0.5134443021766966, "grad_norm": 0.2527479387241998, "learning_rate": 8.460086768632867e-06, "loss": 0.0956, "step": 401 }, { "epoch": 0.5147247119078106, "grad_norm": 0.22357883043866153, "learning_rate": 8.452820314184306e-06, "loss": 0.0978, "step": 402 }, { "epoch": 0.5160051216389244, "grad_norm": 0.20636666864940245, "learning_rate": 8.445539892466083e-06, "loss": 0.097, "step": 403 }, { "epoch": 0.5172855313700384, "grad_norm": 0.2040150058028145, "learning_rate": 8.438245532928793e-06, "loss": 0.1016, "step": 404 }, { "epoch": 0.5185659411011524, "grad_norm": 0.2211749570109207, "learning_rate": 8.43093726507941e-06, "loss": 0.112, "step": 405 }, { "epoch": 0.5198463508322664, "grad_norm": 0.2529422062981371, "learning_rate": 8.423615118481176e-06, "loss": 0.1016, "step": 406 }, { "epoch": 0.5211267605633803, "grad_norm": 0.2000274835791186, "learning_rate": 8.416279122753468e-06, "loss": 0.0941, "step": 407 }, { "epoch": 0.5224071702944942, "grad_norm": 0.17476702718511677, "learning_rate": 8.40892930757169e-06, "loss": 0.0628, "step": 408 }, { "epoch": 0.5236875800256082, "grad_norm": 0.2002581546723564, "learning_rate": 8.401565702667142e-06, "loss": 0.0887, "step": 409 }, { "epoch": 0.5249679897567221, "grad_norm": 0.22385494212767074, "learning_rate": 8.394188337826915e-06, "loss": 0.1016, "step": 410 }, { "epoch": 0.5262483994878361, "grad_norm": 0.22471356538224704, "learning_rate": 8.386797242893754e-06, "loss": 0.0911, "step": 411 }, { "epoch": 0.5275288092189501, "grad_norm": 0.20965600186345326, "learning_rate": 8.379392447765946e-06, "loss": 0.0935, "step": 412 }, { "epoch": 0.528809218950064, "grad_norm": 0.22543725112740778, "learning_rate": 8.3719739823972e-06, "loss": 0.118, "step": 413 }, { "epoch": 0.5300896286811779, "grad_norm": 0.22662611078985517, "learning_rate": 8.364541876796521e-06, "loss": 0.0937, "step": 414 }, { "epoch": 0.5313700384122919, "grad_norm": 0.21414527175137116, "learning_rate": 8.357096161028094e-06, "loss": 0.0934, "step": 415 }, { "epoch": 0.5326504481434059, "grad_norm": 0.2034305852191668, "learning_rate": 8.349636865211158e-06, "loss": 0.0929, "step": 416 }, { "epoch": 0.5339308578745199, "grad_norm": 0.2623912475905132, "learning_rate": 8.342164019519882e-06, "loss": 0.1045, "step": 417 }, { "epoch": 0.5352112676056338, "grad_norm": 0.19984850686210495, "learning_rate": 8.334677654183254e-06, "loss": 0.0874, "step": 418 }, { "epoch": 0.5364916773367477, "grad_norm": 0.20905330007716028, "learning_rate": 8.327177799484949e-06, "loss": 0.0891, "step": 419 }, { "epoch": 0.5377720870678617, "grad_norm": 0.1995885734271121, "learning_rate": 8.319664485763201e-06, "loss": 0.0945, "step": 420 }, { "epoch": 0.5390524967989757, "grad_norm": 0.20714394900602223, "learning_rate": 8.312137743410702e-06, "loss": 0.1066, "step": 421 }, { "epoch": 0.5403329065300896, "grad_norm": 0.19949795715267285, "learning_rate": 8.304597602874454e-06, "loss": 0.0933, "step": 422 }, { "epoch": 0.5416133162612036, "grad_norm": 0.2045834361350689, "learning_rate": 8.29704409465566e-06, "loss": 0.0886, "step": 423 }, { "epoch": 0.5428937259923176, "grad_norm": 0.20782115983693134, "learning_rate": 8.289477249309599e-06, "loss": 0.095, "step": 424 }, { "epoch": 0.5441741357234315, "grad_norm": 0.24168237236018494, "learning_rate": 8.281897097445502e-06, "loss": 0.0903, "step": 425 }, { "epoch": 0.5454545454545454, "grad_norm": 0.19467090810644833, "learning_rate": 8.274303669726427e-06, "loss": 0.078, "step": 426 }, { "epoch": 0.5467349551856594, "grad_norm": 0.23764416891238427, "learning_rate": 8.266696996869131e-06, "loss": 0.1163, "step": 427 }, { "epoch": 0.5480153649167734, "grad_norm": 0.23901360164084473, "learning_rate": 8.259077109643952e-06, "loss": 0.0867, "step": 428 }, { "epoch": 0.5492957746478874, "grad_norm": 0.21395868890610306, "learning_rate": 8.251444038874685e-06, "loss": 0.1164, "step": 429 }, { "epoch": 0.5505761843790012, "grad_norm": 0.24138803207819554, "learning_rate": 8.243797815438455e-06, "loss": 0.1076, "step": 430 }, { "epoch": 0.5518565941101152, "grad_norm": 0.22206056326219661, "learning_rate": 8.236138470265587e-06, "loss": 0.0921, "step": 431 }, { "epoch": 0.5531370038412292, "grad_norm": 0.21915433160629, "learning_rate": 8.228466034339488e-06, "loss": 0.1134, "step": 432 }, { "epoch": 0.5544174135723432, "grad_norm": 0.18646861464677633, "learning_rate": 8.220780538696525e-06, "loss": 0.0762, "step": 433 }, { "epoch": 0.5556978233034571, "grad_norm": 0.2311473218069965, "learning_rate": 8.213082014425884e-06, "loss": 0.11, "step": 434 }, { "epoch": 0.5569782330345711, "grad_norm": 0.18507657813087047, "learning_rate": 8.205370492669462e-06, "loss": 0.0728, "step": 435 }, { "epoch": 0.558258642765685, "grad_norm": 0.2552272774707362, "learning_rate": 8.197646004621729e-06, "loss": 0.1113, "step": 436 }, { "epoch": 0.559539052496799, "grad_norm": 0.2152057162713566, "learning_rate": 8.189908581529605e-06, "loss": 0.0892, "step": 437 }, { "epoch": 0.5608194622279129, "grad_norm": 0.20596775947008292, "learning_rate": 8.182158254692342e-06, "loss": 0.0883, "step": 438 }, { "epoch": 0.5620998719590269, "grad_norm": 0.19226559355391643, "learning_rate": 8.17439505546138e-06, "loss": 0.0754, "step": 439 }, { "epoch": 0.5633802816901409, "grad_norm": 0.21360417020543004, "learning_rate": 8.166619015240236e-06, "loss": 0.1009, "step": 440 }, { "epoch": 0.5646606914212549, "grad_norm": 0.20799065213817622, "learning_rate": 8.15883016548437e-06, "loss": 0.0953, "step": 441 }, { "epoch": 0.5659411011523687, "grad_norm": 0.21480110036067712, "learning_rate": 8.15102853770106e-06, "loss": 0.1041, "step": 442 }, { "epoch": 0.5672215108834827, "grad_norm": 0.21957453232534863, "learning_rate": 8.14321416344927e-06, "loss": 0.1098, "step": 443 }, { "epoch": 0.5685019206145967, "grad_norm": 0.23249177289978376, "learning_rate": 8.135387074339528e-06, "loss": 0.1013, "step": 444 }, { "epoch": 0.5697823303457106, "grad_norm": 0.23489694698156427, "learning_rate": 8.127547302033799e-06, "loss": 0.1079, "step": 445 }, { "epoch": 0.5710627400768246, "grad_norm": 0.21289867857750788, "learning_rate": 8.119694878245344e-06, "loss": 0.0936, "step": 446 }, { "epoch": 0.5723431498079385, "grad_norm": 0.1942745282647202, "learning_rate": 8.111829834738611e-06, "loss": 0.0846, "step": 447 }, { "epoch": 0.5736235595390525, "grad_norm": 0.2057963063742497, "learning_rate": 8.103952203329098e-06, "loss": 0.1051, "step": 448 }, { "epoch": 0.5749039692701664, "grad_norm": 0.20528834957887965, "learning_rate": 8.096062015883214e-06, "loss": 0.0856, "step": 449 }, { "epoch": 0.5761843790012804, "grad_norm": 0.22968980433586125, "learning_rate": 8.088159304318166e-06, "loss": 0.1143, "step": 450 }, { "epoch": 0.5774647887323944, "grad_norm": 0.202166941211175, "learning_rate": 8.080244100601822e-06, "loss": 0.0858, "step": 451 }, { "epoch": 0.5787451984635084, "grad_norm": 0.20041270803672992, "learning_rate": 8.072316436752583e-06, "loss": 0.0871, "step": 452 }, { "epoch": 0.5800256081946222, "grad_norm": 0.21201966594267804, "learning_rate": 8.064376344839254e-06, "loss": 0.0913, "step": 453 }, { "epoch": 0.5813060179257362, "grad_norm": 0.19621292426629885, "learning_rate": 8.056423856980908e-06, "loss": 0.0808, "step": 454 }, { "epoch": 0.5825864276568502, "grad_norm": 0.24211828562004795, "learning_rate": 8.048459005346774e-06, "loss": 0.1089, "step": 455 }, { "epoch": 0.5838668373879642, "grad_norm": 0.21643639083406002, "learning_rate": 8.040481822156083e-06, "loss": 0.0905, "step": 456 }, { "epoch": 0.5851472471190781, "grad_norm": 0.20491355557771065, "learning_rate": 8.032492339677957e-06, "loss": 0.0897, "step": 457 }, { "epoch": 0.586427656850192, "grad_norm": 0.2083280887844811, "learning_rate": 8.024490590231264e-06, "loss": 0.081, "step": 458 }, { "epoch": 0.587708066581306, "grad_norm": 0.1970749192814109, "learning_rate": 8.016476606184498e-06, "loss": 0.086, "step": 459 }, { "epoch": 0.58898847631242, "grad_norm": 0.22784064187381797, "learning_rate": 8.008450419955645e-06, "loss": 0.0974, "step": 460 }, { "epoch": 0.5902688860435339, "grad_norm": 0.23769910185699117, "learning_rate": 8.000412064012052e-06, "loss": 0.1012, "step": 461 }, { "epoch": 0.5915492957746479, "grad_norm": 0.24167925966690462, "learning_rate": 7.992361570870289e-06, "loss": 0.1007, "step": 462 }, { "epoch": 0.5928297055057619, "grad_norm": 0.2151209532080606, "learning_rate": 7.984298973096026e-06, "loss": 0.104, "step": 463 }, { "epoch": 0.5941101152368758, "grad_norm": 0.19647638461355646, "learning_rate": 7.976224303303902e-06, "loss": 0.0852, "step": 464 }, { "epoch": 0.5953905249679897, "grad_norm": 0.20648293767220008, "learning_rate": 7.968137594157382e-06, "loss": 0.0924, "step": 465 }, { "epoch": 0.5966709346991037, "grad_norm": 0.2091738861949434, "learning_rate": 7.96003887836864e-06, "loss": 0.0931, "step": 466 }, { "epoch": 0.5979513444302177, "grad_norm": 0.22544598345783876, "learning_rate": 7.951928188698416e-06, "loss": 0.1094, "step": 467 }, { "epoch": 0.5992317541613317, "grad_norm": 0.20826863859494413, "learning_rate": 7.943805557955887e-06, "loss": 0.1, "step": 468 }, { "epoch": 0.6005121638924455, "grad_norm": 0.2036216557318119, "learning_rate": 7.935671018998529e-06, "loss": 0.0951, "step": 469 }, { "epoch": 0.6017925736235595, "grad_norm": 0.195943096469429, "learning_rate": 7.927524604731994e-06, "loss": 0.0793, "step": 470 }, { "epoch": 0.6030729833546735, "grad_norm": 0.19704009871070222, "learning_rate": 7.919366348109971e-06, "loss": 0.0836, "step": 471 }, { "epoch": 0.6043533930857875, "grad_norm": 0.20110221390721184, "learning_rate": 7.911196282134053e-06, "loss": 0.0809, "step": 472 }, { "epoch": 0.6056338028169014, "grad_norm": 0.20664316332982904, "learning_rate": 7.903014439853605e-06, "loss": 0.0978, "step": 473 }, { "epoch": 0.6069142125480154, "grad_norm": 0.2095969593218784, "learning_rate": 7.894820854365626e-06, "loss": 0.0955, "step": 474 }, { "epoch": 0.6081946222791293, "grad_norm": 0.20756924021310047, "learning_rate": 7.88661555881462e-06, "loss": 0.0878, "step": 475 }, { "epoch": 0.6094750320102432, "grad_norm": 0.20900570586930392, "learning_rate": 7.878398586392461e-06, "loss": 0.0998, "step": 476 }, { "epoch": 0.6107554417413572, "grad_norm": 0.20903135620142851, "learning_rate": 7.870169970338262e-06, "loss": 0.0951, "step": 477 }, { "epoch": 0.6120358514724712, "grad_norm": 0.2826378237305601, "learning_rate": 7.861929743938225e-06, "loss": 0.0815, "step": 478 }, { "epoch": 0.6133162612035852, "grad_norm": 0.21360751039074696, "learning_rate": 7.85367794052553e-06, "loss": 0.1076, "step": 479 }, { "epoch": 0.614596670934699, "grad_norm": 0.21477293125992777, "learning_rate": 7.845414593480183e-06, "loss": 0.105, "step": 480 }, { "epoch": 0.615877080665813, "grad_norm": 0.2023457664473389, "learning_rate": 7.837139736228884e-06, "loss": 0.0981, "step": 481 }, { "epoch": 0.617157490396927, "grad_norm": 0.19808695693582093, "learning_rate": 7.828853402244897e-06, "loss": 0.0826, "step": 482 }, { "epoch": 0.618437900128041, "grad_norm": 0.2392208580842326, "learning_rate": 7.82055562504791e-06, "loss": 0.0906, "step": 483 }, { "epoch": 0.6197183098591549, "grad_norm": 0.253354588585392, "learning_rate": 7.812246438203905e-06, "loss": 0.1153, "step": 484 }, { "epoch": 0.6209987195902689, "grad_norm": 0.2058808678485348, "learning_rate": 7.80392587532501e-06, "loss": 0.0962, "step": 485 }, { "epoch": 0.6222791293213829, "grad_norm": 0.21930171242451035, "learning_rate": 7.795593970069373e-06, "loss": 0.0983, "step": 486 }, { "epoch": 0.6235595390524968, "grad_norm": 0.257674789838677, "learning_rate": 7.787250756141034e-06, "loss": 0.0892, "step": 487 }, { "epoch": 0.6248399487836107, "grad_norm": 0.1984690021734659, "learning_rate": 7.778896267289766e-06, "loss": 0.0855, "step": 488 }, { "epoch": 0.6261203585147247, "grad_norm": 0.21113978957791288, "learning_rate": 7.770530537310956e-06, "loss": 0.0997, "step": 489 }, { "epoch": 0.6274007682458387, "grad_norm": 0.21065522409407045, "learning_rate": 7.762153600045466e-06, "loss": 0.099, "step": 490 }, { "epoch": 0.6286811779769527, "grad_norm": 0.2904918499812123, "learning_rate": 7.753765489379488e-06, "loss": 0.0946, "step": 491 }, { "epoch": 0.6299615877080665, "grad_norm": 0.21075102006023247, "learning_rate": 7.745366239244416e-06, "loss": 0.1003, "step": 492 }, { "epoch": 0.6312419974391805, "grad_norm": 0.2094620657071062, "learning_rate": 7.736955883616706e-06, "loss": 0.096, "step": 493 }, { "epoch": 0.6325224071702945, "grad_norm": 0.26055011504647846, "learning_rate": 7.728534456517733e-06, "loss": 0.0984, "step": 494 }, { "epoch": 0.6338028169014085, "grad_norm": 0.19722967204753608, "learning_rate": 7.720101992013661e-06, "loss": 0.0782, "step": 495 }, { "epoch": 0.6350832266325224, "grad_norm": 0.21570241138705967, "learning_rate": 7.711658524215306e-06, "loss": 0.0916, "step": 496 }, { "epoch": 0.6363636363636364, "grad_norm": 0.2161335833921179, "learning_rate": 7.703204087277989e-06, "loss": 0.0858, "step": 497 }, { "epoch": 0.6376440460947503, "grad_norm": 0.2808761813831047, "learning_rate": 7.694738715401401e-06, "loss": 0.1053, "step": 498 }, { "epoch": 0.6389244558258643, "grad_norm": 0.22132329473438767, "learning_rate": 7.686262442829478e-06, "loss": 0.1021, "step": 499 }, { "epoch": 0.6402048655569782, "grad_norm": 0.2011613614955789, "learning_rate": 7.677775303850237e-06, "loss": 0.0933, "step": 500 }, { "epoch": 0.6414852752880922, "grad_norm": 0.20385002196453325, "learning_rate": 7.669277332795659e-06, "loss": 0.0871, "step": 501 }, { "epoch": 0.6427656850192062, "grad_norm": 0.20983282246170648, "learning_rate": 7.660768564041546e-06, "loss": 0.0908, "step": 502 }, { "epoch": 0.6440460947503202, "grad_norm": 0.21940508259983388, "learning_rate": 7.652249032007368e-06, "loss": 0.0919, "step": 503 }, { "epoch": 0.645326504481434, "grad_norm": 0.2151932262761777, "learning_rate": 7.64371877115615e-06, "loss": 0.0894, "step": 504 }, { "epoch": 0.646606914212548, "grad_norm": 0.1935889774279838, "learning_rate": 7.635177815994299e-06, "loss": 0.0801, "step": 505 }, { "epoch": 0.647887323943662, "grad_norm": 0.2054903357114968, "learning_rate": 7.626626201071494e-06, "loss": 0.0852, "step": 506 }, { "epoch": 0.649167733674776, "grad_norm": 0.21402884357686158, "learning_rate": 7.618063960980535e-06, "loss": 0.0952, "step": 507 }, { "epoch": 0.6504481434058899, "grad_norm": 0.2282304878907227, "learning_rate": 7.609491130357196e-06, "loss": 0.1029, "step": 508 }, { "epoch": 0.6517285531370038, "grad_norm": 0.20385903238478356, "learning_rate": 7.600907743880097e-06, "loss": 0.0799, "step": 509 }, { "epoch": 0.6530089628681178, "grad_norm": 0.22747870669220993, "learning_rate": 7.592313836270555e-06, "loss": 0.1065, "step": 510 }, { "epoch": 0.6542893725992317, "grad_norm": 0.23493760126598032, "learning_rate": 7.583709442292451e-06, "loss": 0.1061, "step": 511 }, { "epoch": 0.6555697823303457, "grad_norm": 0.1908273794910358, "learning_rate": 7.575094596752078e-06, "loss": 0.0758, "step": 512 }, { "epoch": 0.6568501920614597, "grad_norm": 0.2225643172164682, "learning_rate": 7.566469334498014e-06, "loss": 0.0975, "step": 513 }, { "epoch": 0.6581306017925737, "grad_norm": 0.19316186114530703, "learning_rate": 7.5578336904209744e-06, "loss": 0.0781, "step": 514 }, { "epoch": 0.6594110115236875, "grad_norm": 0.2328384811760853, "learning_rate": 7.5491876994536675e-06, "loss": 0.0953, "step": 515 }, { "epoch": 0.6606914212548015, "grad_norm": 0.22832992167048238, "learning_rate": 7.540531396570656e-06, "loss": 0.0949, "step": 516 }, { "epoch": 0.6619718309859155, "grad_norm": 0.20966509858283341, "learning_rate": 7.53186481678822e-06, "loss": 0.0821, "step": 517 }, { "epoch": 0.6632522407170295, "grad_norm": 0.22806117259403608, "learning_rate": 7.523187995164209e-06, "loss": 0.1096, "step": 518 }, { "epoch": 0.6645326504481434, "grad_norm": 0.2247600915988335, "learning_rate": 7.514500966797904e-06, "loss": 0.1099, "step": 519 }, { "epoch": 0.6658130601792573, "grad_norm": 0.1988568898385622, "learning_rate": 7.5058037668298734e-06, "loss": 0.0805, "step": 520 }, { "epoch": 0.6670934699103713, "grad_norm": 0.21354304234528193, "learning_rate": 7.497096430441829e-06, "loss": 0.0878, "step": 521 }, { "epoch": 0.6683738796414853, "grad_norm": 0.1963034298194932, "learning_rate": 7.488378992856491e-06, "loss": 0.0796, "step": 522 }, { "epoch": 0.6696542893725992, "grad_norm": 0.19696526222751762, "learning_rate": 7.479651489337436e-06, "loss": 0.0826, "step": 523 }, { "epoch": 0.6709346991037132, "grad_norm": 0.19898887452632463, "learning_rate": 7.470913955188963e-06, "loss": 0.0827, "step": 524 }, { "epoch": 0.6722151088348272, "grad_norm": 0.1966649984219717, "learning_rate": 7.4621664257559455e-06, "loss": 0.0812, "step": 525 }, { "epoch": 0.6734955185659411, "grad_norm": 0.22538776073774067, "learning_rate": 7.453408936423688e-06, "loss": 0.1137, "step": 526 }, { "epoch": 0.674775928297055, "grad_norm": 0.2114568329768989, "learning_rate": 7.444641522617788e-06, "loss": 0.0883, "step": 527 }, { "epoch": 0.676056338028169, "grad_norm": 0.2237048408438941, "learning_rate": 7.4358642198039835e-06, "loss": 0.0979, "step": 528 }, { "epoch": 0.677336747759283, "grad_norm": 0.20864640335796314, "learning_rate": 7.4270770634880205e-06, "loss": 0.1008, "step": 529 }, { "epoch": 0.678617157490397, "grad_norm": 0.19941053685600907, "learning_rate": 7.418280089215503e-06, "loss": 0.0776, "step": 530 }, { "epoch": 0.6798975672215108, "grad_norm": 0.22599519749770347, "learning_rate": 7.409473332571751e-06, "loss": 0.0985, "step": 531 }, { "epoch": 0.6811779769526248, "grad_norm": 0.20104387514890038, "learning_rate": 7.4006568291816514e-06, "loss": 0.0797, "step": 532 }, { "epoch": 0.6824583866837388, "grad_norm": 0.21812279021697822, "learning_rate": 7.391830614709524e-06, "loss": 0.0865, "step": 533 }, { "epoch": 0.6837387964148528, "grad_norm": 0.2416013962731083, "learning_rate": 7.3829947248589695e-06, "loss": 0.1127, "step": 534 }, { "epoch": 0.6850192061459667, "grad_norm": 0.2003919101511796, "learning_rate": 7.374149195372726e-06, "loss": 0.0874, "step": 535 }, { "epoch": 0.6862996158770807, "grad_norm": 0.19590905254019728, "learning_rate": 7.365294062032529e-06, "loss": 0.0852, "step": 536 }, { "epoch": 0.6875800256081946, "grad_norm": 0.26599802437986486, "learning_rate": 7.356429360658959e-06, "loss": 0.0927, "step": 537 }, { "epoch": 0.6888604353393086, "grad_norm": 0.19193172004383002, "learning_rate": 7.347555127111304e-06, "loss": 0.0851, "step": 538 }, { "epoch": 0.6901408450704225, "grad_norm": 0.20793145756385153, "learning_rate": 7.338671397287409e-06, "loss": 0.0894, "step": 539 }, { "epoch": 0.6914212548015365, "grad_norm": 0.2055106008906945, "learning_rate": 7.329778207123537e-06, "loss": 0.0853, "step": 540 }, { "epoch": 0.6927016645326505, "grad_norm": 0.20829375551640006, "learning_rate": 7.320875592594214e-06, "loss": 0.0886, "step": 541 }, { "epoch": 0.6939820742637645, "grad_norm": 0.21290514469654984, "learning_rate": 7.311963589712096e-06, "loss": 0.0999, "step": 542 }, { "epoch": 0.6952624839948783, "grad_norm": 0.21149698942960687, "learning_rate": 7.303042234527811e-06, "loss": 0.0804, "step": 543 }, { "epoch": 0.6965428937259923, "grad_norm": 0.21635596222694858, "learning_rate": 7.2941115631298195e-06, "loss": 0.0925, "step": 544 }, { "epoch": 0.6978233034571063, "grad_norm": 0.22404428440527097, "learning_rate": 7.285171611644269e-06, "loss": 0.0927, "step": 545 }, { "epoch": 0.6991037131882202, "grad_norm": 0.22971347222613758, "learning_rate": 7.276222416234851e-06, "loss": 0.1051, "step": 546 }, { "epoch": 0.7003841229193342, "grad_norm": 0.21736571960959283, "learning_rate": 7.2672640131026406e-06, "loss": 0.1044, "step": 547 }, { "epoch": 0.7016645326504481, "grad_norm": 0.2309133152346766, "learning_rate": 7.258296438485967e-06, "loss": 0.0855, "step": 548 }, { "epoch": 0.7029449423815621, "grad_norm": 0.19083944951400558, "learning_rate": 7.249319728660258e-06, "loss": 0.0763, "step": 549 }, { "epoch": 0.704225352112676, "grad_norm": 0.20534768775399787, "learning_rate": 7.240333919937893e-06, "loss": 0.0995, "step": 550 }, { "epoch": 0.70550576184379, "grad_norm": 0.2146587648228196, "learning_rate": 7.231339048668062e-06, "loss": 0.0811, "step": 551 }, { "epoch": 0.706786171574904, "grad_norm": 0.21580838781900943, "learning_rate": 7.222335151236611e-06, "loss": 0.0976, "step": 552 }, { "epoch": 0.708066581306018, "grad_norm": 0.22033426923943317, "learning_rate": 7.213322264065899e-06, "loss": 0.098, "step": 553 }, { "epoch": 0.7093469910371318, "grad_norm": 0.2178216158240516, "learning_rate": 7.204300423614653e-06, "loss": 0.1, "step": 554 }, { "epoch": 0.7106274007682458, "grad_norm": 0.21815161265725866, "learning_rate": 7.1952696663778125e-06, "loss": 0.091, "step": 555 }, { "epoch": 0.7119078104993598, "grad_norm": 0.227585489329681, "learning_rate": 7.18623002888639e-06, "loss": 0.1045, "step": 556 }, { "epoch": 0.7131882202304738, "grad_norm": 0.2250273320243543, "learning_rate": 7.177181547707324e-06, "loss": 0.1028, "step": 557 }, { "epoch": 0.7144686299615877, "grad_norm": 0.20229478695946837, "learning_rate": 7.168124259443321e-06, "loss": 0.0765, "step": 558 }, { "epoch": 0.7157490396927016, "grad_norm": 0.21027769617724618, "learning_rate": 7.159058200732715e-06, "loss": 0.0958, "step": 559 }, { "epoch": 0.7170294494238156, "grad_norm": 0.2320845186272795, "learning_rate": 7.149983408249318e-06, "loss": 0.0844, "step": 560 }, { "epoch": 0.7183098591549296, "grad_norm": 0.1893081108690409, "learning_rate": 7.140899918702276e-06, "loss": 0.0676, "step": 561 }, { "epoch": 0.7195902688860435, "grad_norm": 0.2368881469429254, "learning_rate": 7.131807768835913e-06, "loss": 0.0973, "step": 562 }, { "epoch": 0.7208706786171575, "grad_norm": 0.2255545078751757, "learning_rate": 7.122706995429583e-06, "loss": 0.0988, "step": 563 }, { "epoch": 0.7221510883482715, "grad_norm": 0.30179835389551113, "learning_rate": 7.113597635297527e-06, "loss": 0.1105, "step": 564 }, { "epoch": 0.7234314980793854, "grad_norm": 0.1929990100979232, "learning_rate": 7.104479725288719e-06, "loss": 0.0779, "step": 565 }, { "epoch": 0.7247119078104993, "grad_norm": 0.19512613499490358, "learning_rate": 7.095353302286722e-06, "loss": 0.0866, "step": 566 }, { "epoch": 0.7259923175416133, "grad_norm": 0.21160236355731724, "learning_rate": 7.086218403209531e-06, "loss": 0.0815, "step": 567 }, { "epoch": 0.7272727272727273, "grad_norm": 0.20931989409566873, "learning_rate": 7.0770750650094335e-06, "loss": 0.0836, "step": 568 }, { "epoch": 0.7285531370038413, "grad_norm": 0.2059789111761751, "learning_rate": 7.067923324672848e-06, "loss": 0.0812, "step": 569 }, { "epoch": 0.7298335467349552, "grad_norm": 0.19372432229163275, "learning_rate": 7.0587632192201885e-06, "loss": 0.081, "step": 570 }, { "epoch": 0.7311139564660691, "grad_norm": 0.2254142725863397, "learning_rate": 7.0495947857057015e-06, "loss": 0.0939, "step": 571 }, { "epoch": 0.7323943661971831, "grad_norm": 0.21855886968387497, "learning_rate": 7.040418061217325e-06, "loss": 0.1041, "step": 572 }, { "epoch": 0.7336747759282971, "grad_norm": 0.21137973932722015, "learning_rate": 7.031233082876535e-06, "loss": 0.0964, "step": 573 }, { "epoch": 0.734955185659411, "grad_norm": 0.2248442740548843, "learning_rate": 7.022039887838197e-06, "loss": 0.0916, "step": 574 }, { "epoch": 0.736235595390525, "grad_norm": 0.22452787260185436, "learning_rate": 7.012838513290409e-06, "loss": 0.0872, "step": 575 }, { "epoch": 0.737516005121639, "grad_norm": 0.20489774532875812, "learning_rate": 7.003628996454363e-06, "loss": 0.0881, "step": 576 }, { "epoch": 0.7387964148527529, "grad_norm": 0.3496915377744987, "learning_rate": 6.9944113745841845e-06, "loss": 0.0937, "step": 577 }, { "epoch": 0.7400768245838668, "grad_norm": 0.2696186131970291, "learning_rate": 6.985185684966791e-06, "loss": 0.1118, "step": 578 }, { "epoch": 0.7413572343149808, "grad_norm": 0.20438432431319165, "learning_rate": 6.975951964921726e-06, "loss": 0.0787, "step": 579 }, { "epoch": 0.7426376440460948, "grad_norm": 0.20476309648260704, "learning_rate": 6.966710251801022e-06, "loss": 0.0808, "step": 580 }, { "epoch": 0.7439180537772087, "grad_norm": 0.1978584969954892, "learning_rate": 6.9574605829890484e-06, "loss": 0.0803, "step": 581 }, { "epoch": 0.7451984635083226, "grad_norm": 0.22445872822384358, "learning_rate": 6.948202995902351e-06, "loss": 0.1082, "step": 582 }, { "epoch": 0.7464788732394366, "grad_norm": 0.20789187703711318, "learning_rate": 6.938937527989511e-06, "loss": 0.0918, "step": 583 }, { "epoch": 0.7477592829705506, "grad_norm": 0.20694696287489722, "learning_rate": 6.929664216730987e-06, "loss": 0.0873, "step": 584 }, { "epoch": 0.7490396927016645, "grad_norm": 0.20963959087622394, "learning_rate": 6.920383099638964e-06, "loss": 0.1004, "step": 585 }, { "epoch": 0.7503201024327785, "grad_norm": 0.20237436777704496, "learning_rate": 6.911094214257204e-06, "loss": 0.0857, "step": 586 }, { "epoch": 0.7516005121638925, "grad_norm": 0.20077278572270885, "learning_rate": 6.901797598160895e-06, "loss": 0.0964, "step": 587 }, { "epoch": 0.7528809218950064, "grad_norm": 0.20153256223041455, "learning_rate": 6.892493288956496e-06, "loss": 0.0778, "step": 588 }, { "epoch": 0.7541613316261203, "grad_norm": 0.2264582441525431, "learning_rate": 6.8831813242815834e-06, "loss": 0.103, "step": 589 }, { "epoch": 0.7554417413572343, "grad_norm": 0.20849380478870774, "learning_rate": 6.8738617418047046e-06, "loss": 0.0855, "step": 590 }, { "epoch": 0.7567221510883483, "grad_norm": 0.19956574110340775, "learning_rate": 6.864534579225221e-06, "loss": 0.0881, "step": 591 }, { "epoch": 0.7580025608194623, "grad_norm": 0.20685218997163107, "learning_rate": 6.8551998742731565e-06, "loss": 0.0822, "step": 592 }, { "epoch": 0.7592829705505761, "grad_norm": 0.2113261469180614, "learning_rate": 6.845857664709048e-06, "loss": 0.0972, "step": 593 }, { "epoch": 0.7605633802816901, "grad_norm": 0.23127404291692885, "learning_rate": 6.836507988323785e-06, "loss": 0.0961, "step": 594 }, { "epoch": 0.7618437900128041, "grad_norm": 0.2224645074748306, "learning_rate": 6.827150882938466e-06, "loss": 0.1031, "step": 595 }, { "epoch": 0.7631241997439181, "grad_norm": 0.24925020559207858, "learning_rate": 6.8177863864042386e-06, "loss": 0.0887, "step": 596 }, { "epoch": 0.764404609475032, "grad_norm": 0.22780135728146356, "learning_rate": 6.808414536602149e-06, "loss": 0.1147, "step": 597 }, { "epoch": 0.765685019206146, "grad_norm": 0.2065992672399531, "learning_rate": 6.799035371442992e-06, "loss": 0.0893, "step": 598 }, { "epoch": 0.7669654289372599, "grad_norm": 0.22445619670422193, "learning_rate": 6.789648928867147e-06, "loss": 0.0904, "step": 599 }, { "epoch": 0.7682458386683739, "grad_norm": 0.22634612840461343, "learning_rate": 6.7802552468444425e-06, "loss": 0.1118, "step": 600 }, { "epoch": 0.7682458386683739, "eval_loss": 0.0902518481016159, "eval_runtime": 10.9454, "eval_samples_per_second": 23.115, "eval_steps_per_second": 5.847, "step": 600 }, { "epoch": 0.7695262483994878, "grad_norm": 0.21526250908775035, "learning_rate": 6.770854363373983e-06, "loss": 0.1072, "step": 601 }, { "epoch": 0.7708066581306018, "grad_norm": 0.2150822057230654, "learning_rate": 6.761446316484006e-06, "loss": 0.0921, "step": 602 }, { "epoch": 0.7720870678617158, "grad_norm": 0.2085891608181676, "learning_rate": 6.752031144231727e-06, "loss": 0.0924, "step": 603 }, { "epoch": 0.7733674775928298, "grad_norm": 0.19668957171199403, "learning_rate": 6.742608884703189e-06, "loss": 0.088, "step": 604 }, { "epoch": 0.7746478873239436, "grad_norm": 0.2169366828618394, "learning_rate": 6.733179576013098e-06, "loss": 0.0984, "step": 605 }, { "epoch": 0.7759282970550576, "grad_norm": 0.1926979530266659, "learning_rate": 6.723743256304677e-06, "loss": 0.0818, "step": 606 }, { "epoch": 0.7772087067861716, "grad_norm": 0.21390538353031593, "learning_rate": 6.714299963749509e-06, "loss": 0.0999, "step": 607 }, { "epoch": 0.7784891165172856, "grad_norm": 0.22738476791060505, "learning_rate": 6.704849736547389e-06, "loss": 0.1054, "step": 608 }, { "epoch": 0.7797695262483995, "grad_norm": 0.1969274259455136, "learning_rate": 6.695392612926157e-06, "loss": 0.0906, "step": 609 }, { "epoch": 0.7810499359795134, "grad_norm": 0.2016657784028972, "learning_rate": 6.685928631141553e-06, "loss": 0.0885, "step": 610 }, { "epoch": 0.7823303457106274, "grad_norm": 0.20469598912113135, "learning_rate": 6.67645782947706e-06, "loss": 0.0749, "step": 611 }, { "epoch": 0.7836107554417413, "grad_norm": 0.21524706800995877, "learning_rate": 6.666980246243747e-06, "loss": 0.1015, "step": 612 }, { "epoch": 0.7848911651728553, "grad_norm": 0.21560513689074232, "learning_rate": 6.6574959197801185e-06, "loss": 0.0836, "step": 613 }, { "epoch": 0.7861715749039693, "grad_norm": 0.20334524850100763, "learning_rate": 6.648004888451952e-06, "loss": 0.0906, "step": 614 }, { "epoch": 0.7874519846350833, "grad_norm": 0.2041190838260672, "learning_rate": 6.638507190652153e-06, "loss": 0.1028, "step": 615 }, { "epoch": 0.7887323943661971, "grad_norm": 0.2220679161360237, "learning_rate": 6.629002864800589e-06, "loss": 0.1045, "step": 616 }, { "epoch": 0.7900128040973111, "grad_norm": 0.2047442699012787, "learning_rate": 6.619491949343941e-06, "loss": 0.0937, "step": 617 }, { "epoch": 0.7912932138284251, "grad_norm": 0.22071034753149052, "learning_rate": 6.609974482755548e-06, "loss": 0.0984, "step": 618 }, { "epoch": 0.7925736235595391, "grad_norm": 0.19786958197863583, "learning_rate": 6.600450503535248e-06, "loss": 0.0919, "step": 619 }, { "epoch": 0.793854033290653, "grad_norm": 0.2202102615631664, "learning_rate": 6.590920050209224e-06, "loss": 0.099, "step": 620 }, { "epoch": 0.795134443021767, "grad_norm": 0.20637789988728844, "learning_rate": 6.581383161329845e-06, "loss": 0.0975, "step": 621 }, { "epoch": 0.7964148527528809, "grad_norm": 0.22851991907665467, "learning_rate": 6.571839875475518e-06, "loss": 0.0975, "step": 622 }, { "epoch": 0.7976952624839949, "grad_norm": 0.2241608112123401, "learning_rate": 6.562290231250521e-06, "loss": 0.0858, "step": 623 }, { "epoch": 0.7989756722151088, "grad_norm": 0.24866374262064805, "learning_rate": 6.55273426728486e-06, "loss": 0.0903, "step": 624 }, { "epoch": 0.8002560819462228, "grad_norm": 0.1986158880247671, "learning_rate": 6.5431720222341e-06, "loss": 0.0905, "step": 625 }, { "epoch": 0.8015364916773368, "grad_norm": 0.20856308114619965, "learning_rate": 6.533603534779216e-06, "loss": 0.0828, "step": 626 }, { "epoch": 0.8028169014084507, "grad_norm": 0.19737477406082587, "learning_rate": 6.524028843626433e-06, "loss": 0.0817, "step": 627 }, { "epoch": 0.8040973111395646, "grad_norm": 0.2306700399218336, "learning_rate": 6.514447987507076e-06, "loss": 0.1072, "step": 628 }, { "epoch": 0.8053777208706786, "grad_norm": 0.22823387123809513, "learning_rate": 6.5048610051774e-06, "loss": 0.0997, "step": 629 }, { "epoch": 0.8066581306017926, "grad_norm": 0.20928115767376262, "learning_rate": 6.495267935418449e-06, "loss": 0.0895, "step": 630 }, { "epoch": 0.8079385403329066, "grad_norm": 0.2119094744744134, "learning_rate": 6.485668817035888e-06, "loss": 0.1047, "step": 631 }, { "epoch": 0.8092189500640204, "grad_norm": 0.23361430177640752, "learning_rate": 6.476063688859854e-06, "loss": 0.0868, "step": 632 }, { "epoch": 0.8104993597951344, "grad_norm": 0.2435003750419622, "learning_rate": 6.4664525897447864e-06, "loss": 0.0878, "step": 633 }, { "epoch": 0.8117797695262484, "grad_norm": 0.21743532273280491, "learning_rate": 6.456835558569287e-06, "loss": 0.1026, "step": 634 }, { "epoch": 0.8130601792573624, "grad_norm": 0.20318473038150192, "learning_rate": 6.447212634235948e-06, "loss": 0.0831, "step": 635 }, { "epoch": 0.8143405889884763, "grad_norm": 0.21022079327910895, "learning_rate": 6.437583855671205e-06, "loss": 0.0967, "step": 636 }, { "epoch": 0.8156209987195903, "grad_norm": 0.24610876390765762, "learning_rate": 6.427949261825172e-06, "loss": 0.1094, "step": 637 }, { "epoch": 0.8169014084507042, "grad_norm": 0.22223460249610175, "learning_rate": 6.418308891671484e-06, "loss": 0.1119, "step": 638 }, { "epoch": 0.8181818181818182, "grad_norm": 0.19643552726479147, "learning_rate": 6.408662784207149e-06, "loss": 0.0821, "step": 639 }, { "epoch": 0.8194622279129321, "grad_norm": 0.22027933120887905, "learning_rate": 6.39901097845238e-06, "loss": 0.092, "step": 640 }, { "epoch": 0.8207426376440461, "grad_norm": 0.2135807082588209, "learning_rate": 6.389353513450438e-06, "loss": 0.0958, "step": 641 }, { "epoch": 0.8220230473751601, "grad_norm": 0.21353508283579709, "learning_rate": 6.379690428267482e-06, "loss": 0.0939, "step": 642 }, { "epoch": 0.8233034571062741, "grad_norm": 0.21237060161707505, "learning_rate": 6.370021761992401e-06, "loss": 0.0898, "step": 643 }, { "epoch": 0.8245838668373879, "grad_norm": 0.21881713059522825, "learning_rate": 6.360347553736664e-06, "loss": 0.0909, "step": 644 }, { "epoch": 0.8258642765685019, "grad_norm": 0.25262631494896826, "learning_rate": 6.350667842634155e-06, "loss": 0.0876, "step": 645 }, { "epoch": 0.8271446862996159, "grad_norm": 0.245901825892452, "learning_rate": 6.340982667841021e-06, "loss": 0.1136, "step": 646 }, { "epoch": 0.8284250960307298, "grad_norm": 0.3636289426395242, "learning_rate": 6.33129206853551e-06, "loss": 0.1009, "step": 647 }, { "epoch": 0.8297055057618438, "grad_norm": 0.24359498396069168, "learning_rate": 6.321596083917809e-06, "loss": 0.1085, "step": 648 }, { "epoch": 0.8309859154929577, "grad_norm": 0.22746871247428258, "learning_rate": 6.311894753209896e-06, "loss": 0.1048, "step": 649 }, { "epoch": 0.8322663252240717, "grad_norm": 0.20892694689612523, "learning_rate": 6.302188115655371e-06, "loss": 0.09, "step": 650 }, { "epoch": 0.8335467349551856, "grad_norm": 0.2314395700682405, "learning_rate": 6.2924762105193026e-06, "loss": 0.0964, "step": 651 }, { "epoch": 0.8348271446862996, "grad_norm": 0.20972910477026893, "learning_rate": 6.282759077088067e-06, "loss": 0.0835, "step": 652 }, { "epoch": 0.8361075544174136, "grad_norm": 0.1994367397974724, "learning_rate": 6.273036754669187e-06, "loss": 0.0819, "step": 653 }, { "epoch": 0.8373879641485276, "grad_norm": 0.20354184469505698, "learning_rate": 6.26330928259118e-06, "loss": 0.0894, "step": 654 }, { "epoch": 0.8386683738796414, "grad_norm": 0.21989159610684933, "learning_rate": 6.2535767002033975e-06, "loss": 0.099, "step": 655 }, { "epoch": 0.8399487836107554, "grad_norm": 0.19835745495534965, "learning_rate": 6.243839046875854e-06, "loss": 0.0811, "step": 656 }, { "epoch": 0.8412291933418694, "grad_norm": 0.20622957052471386, "learning_rate": 6.234096361999082e-06, "loss": 0.0918, "step": 657 }, { "epoch": 0.8425096030729834, "grad_norm": 0.260000708022324, "learning_rate": 6.224348684983969e-06, "loss": 0.0879, "step": 658 }, { "epoch": 0.8437900128040973, "grad_norm": 0.2074456197617979, "learning_rate": 6.214596055261595e-06, "loss": 0.0937, "step": 659 }, { "epoch": 0.8450704225352113, "grad_norm": 0.19363941213309677, "learning_rate": 6.204838512283073e-06, "loss": 0.0881, "step": 660 }, { "epoch": 0.8463508322663252, "grad_norm": 0.19141473164660533, "learning_rate": 6.1950760955193925e-06, "loss": 0.0817, "step": 661 }, { "epoch": 0.8476312419974392, "grad_norm": 0.20423772881922167, "learning_rate": 6.18530884446126e-06, "loss": 0.0955, "step": 662 }, { "epoch": 0.8489116517285531, "grad_norm": 0.20432023031069352, "learning_rate": 6.175536798618935e-06, "loss": 0.0893, "step": 663 }, { "epoch": 0.8501920614596671, "grad_norm": 0.23056482497196074, "learning_rate": 6.16575999752207e-06, "loss": 0.1003, "step": 664 }, { "epoch": 0.8514724711907811, "grad_norm": 0.22842563449354267, "learning_rate": 6.155978480719564e-06, "loss": 0.1002, "step": 665 }, { "epoch": 0.852752880921895, "grad_norm": 0.21889163873623405, "learning_rate": 6.146192287779378e-06, "loss": 0.1016, "step": 666 }, { "epoch": 0.8540332906530089, "grad_norm": 0.24637116287327893, "learning_rate": 6.1364014582884005e-06, "loss": 0.1082, "step": 667 }, { "epoch": 0.8553137003841229, "grad_norm": 0.20709273221602, "learning_rate": 6.126606031852267e-06, "loss": 0.0862, "step": 668 }, { "epoch": 0.8565941101152369, "grad_norm": 0.21674147843079966, "learning_rate": 6.116806048095214e-06, "loss": 0.116, "step": 669 }, { "epoch": 0.8578745198463509, "grad_norm": 0.20648239547731237, "learning_rate": 6.107001546659911e-06, "loss": 0.0767, "step": 670 }, { "epoch": 0.8591549295774648, "grad_norm": 0.22510294161102934, "learning_rate": 6.097192567207304e-06, "loss": 0.0897, "step": 671 }, { "epoch": 0.8604353393085787, "grad_norm": 0.23847351777891543, "learning_rate": 6.0873791494164505e-06, "loss": 0.0803, "step": 672 }, { "epoch": 0.8617157490396927, "grad_norm": 0.2382601380316642, "learning_rate": 6.077561332984363e-06, "loss": 0.0807, "step": 673 }, { "epoch": 0.8629961587708067, "grad_norm": 0.2212974026882155, "learning_rate": 6.067739157625848e-06, "loss": 0.0851, "step": 674 }, { "epoch": 0.8642765685019206, "grad_norm": 0.22824639032445668, "learning_rate": 6.057912663073346e-06, "loss": 0.0845, "step": 675 }, { "epoch": 0.8655569782330346, "grad_norm": 0.22517922136060844, "learning_rate": 6.048081889076767e-06, "loss": 0.0958, "step": 676 }, { "epoch": 0.8668373879641486, "grad_norm": 0.2169438505385251, "learning_rate": 6.038246875403331e-06, "loss": 0.0993, "step": 677 }, { "epoch": 0.8681177976952625, "grad_norm": 0.20741842095462146, "learning_rate": 6.028407661837412e-06, "loss": 0.0855, "step": 678 }, { "epoch": 0.8693982074263764, "grad_norm": 0.23680415005976288, "learning_rate": 6.018564288180372e-06, "loss": 0.1158, "step": 679 }, { "epoch": 0.8706786171574904, "grad_norm": 0.21055437955663672, "learning_rate": 6.008716794250398e-06, "loss": 0.1011, "step": 680 }, { "epoch": 0.8719590268886044, "grad_norm": 0.22340767421558255, "learning_rate": 5.998865219882348e-06, "loss": 0.1044, "step": 681 }, { "epoch": 0.8732394366197183, "grad_norm": 0.23712051274458523, "learning_rate": 5.989009604927587e-06, "loss": 0.0975, "step": 682 }, { "epoch": 0.8745198463508322, "grad_norm": 0.20357467803403767, "learning_rate": 5.979149989253821e-06, "loss": 0.0928, "step": 683 }, { "epoch": 0.8758002560819462, "grad_norm": 0.19572112709374134, "learning_rate": 5.969286412744943e-06, "loss": 0.0821, "step": 684 }, { "epoch": 0.8770806658130602, "grad_norm": 0.20216923972716555, "learning_rate": 5.959418915300863e-06, "loss": 0.0869, "step": 685 }, { "epoch": 0.8783610755441741, "grad_norm": 0.23761541121731025, "learning_rate": 5.94954753683736e-06, "loss": 0.0874, "step": 686 }, { "epoch": 0.8796414852752881, "grad_norm": 0.21661633569170327, "learning_rate": 5.939672317285907e-06, "loss": 0.1032, "step": 687 }, { "epoch": 0.8809218950064021, "grad_norm": 0.2201771959516145, "learning_rate": 5.929793296593515e-06, "loss": 0.0988, "step": 688 }, { "epoch": 0.882202304737516, "grad_norm": 0.20971115697401213, "learning_rate": 5.919910514722572e-06, "loss": 0.0918, "step": 689 }, { "epoch": 0.8834827144686299, "grad_norm": 0.19389279752861957, "learning_rate": 5.910024011650682e-06, "loss": 0.0797, "step": 690 }, { "epoch": 0.8847631241997439, "grad_norm": 0.2082382668762539, "learning_rate": 5.900133827370501e-06, "loss": 0.0952, "step": 691 }, { "epoch": 0.8860435339308579, "grad_norm": 0.19060694370887094, "learning_rate": 5.890240001889576e-06, "loss": 0.0754, "step": 692 }, { "epoch": 0.8873239436619719, "grad_norm": 0.20131435146773274, "learning_rate": 5.8803425752301814e-06, "loss": 0.0886, "step": 693 }, { "epoch": 0.8886043533930857, "grad_norm": 0.21011608104116342, "learning_rate": 5.870441587429164e-06, "loss": 0.0965, "step": 694 }, { "epoch": 0.8898847631241997, "grad_norm": 0.22919848824366482, "learning_rate": 5.860537078537768e-06, "loss": 0.1097, "step": 695 }, { "epoch": 0.8911651728553137, "grad_norm": 0.1974688343151552, "learning_rate": 5.850629088621491e-06, "loss": 0.0824, "step": 696 }, { "epoch": 0.8924455825864277, "grad_norm": 0.20747052362475285, "learning_rate": 5.840717657759904e-06, "loss": 0.0874, "step": 697 }, { "epoch": 0.8937259923175416, "grad_norm": 0.22059010243728006, "learning_rate": 5.830802826046503e-06, "loss": 0.104, "step": 698 }, { "epoch": 0.8950064020486556, "grad_norm": 0.21124304839690355, "learning_rate": 5.820884633588536e-06, "loss": 0.0983, "step": 699 }, { "epoch": 0.8962868117797695, "grad_norm": 0.2065261464519528, "learning_rate": 5.8109631205068476e-06, "loss": 0.0887, "step": 700 }, { "epoch": 0.8975672215108835, "grad_norm": 0.19187873431684702, "learning_rate": 5.801038326935714e-06, "loss": 0.0709, "step": 701 }, { "epoch": 0.8988476312419974, "grad_norm": 0.22969242402776796, "learning_rate": 5.791110293022687e-06, "loss": 0.1059, "step": 702 }, { "epoch": 0.9001280409731114, "grad_norm": 0.23032223458522574, "learning_rate": 5.781179058928419e-06, "loss": 0.0964, "step": 703 }, { "epoch": 0.9014084507042254, "grad_norm": 0.21933881118957108, "learning_rate": 5.771244664826512e-06, "loss": 0.1005, "step": 704 }, { "epoch": 0.9026888604353394, "grad_norm": 0.21557992777730176, "learning_rate": 5.761307150903349e-06, "loss": 0.0923, "step": 705 }, { "epoch": 0.9039692701664532, "grad_norm": 0.22630128788020043, "learning_rate": 5.7513665573579335e-06, "loss": 0.1083, "step": 706 }, { "epoch": 0.9052496798975672, "grad_norm": 0.22739568146340777, "learning_rate": 5.741422924401727e-06, "loss": 0.0943, "step": 707 }, { "epoch": 0.9065300896286812, "grad_norm": 0.20230819112215423, "learning_rate": 5.7314762922584866e-06, "loss": 0.0877, "step": 708 }, { "epoch": 0.9078104993597952, "grad_norm": 0.20626024746969185, "learning_rate": 5.7215267011641e-06, "loss": 0.0889, "step": 709 }, { "epoch": 0.9090909090909091, "grad_norm": 0.21673261257396034, "learning_rate": 5.711574191366427e-06, "loss": 0.0951, "step": 710 }, { "epoch": 0.910371318822023, "grad_norm": 0.685602427502706, "learning_rate": 5.701618803125128e-06, "loss": 0.0952, "step": 711 }, { "epoch": 0.911651728553137, "grad_norm": 0.20084632595233004, "learning_rate": 5.691660576711516e-06, "loss": 0.085, "step": 712 }, { "epoch": 0.912932138284251, "grad_norm": 0.21294564249577722, "learning_rate": 5.6816995524083785e-06, "loss": 0.0898, "step": 713 }, { "epoch": 0.9142125480153649, "grad_norm": 0.20382179876673195, "learning_rate": 5.671735770509824e-06, "loss": 0.0807, "step": 714 }, { "epoch": 0.9154929577464789, "grad_norm": 0.20214195449102987, "learning_rate": 5.661769271321113e-06, "loss": 0.0758, "step": 715 }, { "epoch": 0.9167733674775929, "grad_norm": 0.21897570195630386, "learning_rate": 5.651800095158502e-06, "loss": 0.0828, "step": 716 }, { "epoch": 0.9180537772087067, "grad_norm": 0.22919020752131444, "learning_rate": 5.641828282349071e-06, "loss": 0.0826, "step": 717 }, { "epoch": 0.9193341869398207, "grad_norm": 0.21465620283057857, "learning_rate": 5.631853873230572e-06, "loss": 0.0771, "step": 718 }, { "epoch": 0.9206145966709347, "grad_norm": 0.21735275524042424, "learning_rate": 5.621876908151254e-06, "loss": 0.109, "step": 719 }, { "epoch": 0.9218950064020487, "grad_norm": 0.23511394969364702, "learning_rate": 5.611897427469709e-06, "loss": 0.1, "step": 720 }, { "epoch": 0.9231754161331626, "grad_norm": 0.22462420963730104, "learning_rate": 5.6019154715547045e-06, "loss": 0.1098, "step": 721 }, { "epoch": 0.9244558258642765, "grad_norm": 0.1877139488883673, "learning_rate": 5.591931080785018e-06, "loss": 0.0636, "step": 722 }, { "epoch": 0.9257362355953905, "grad_norm": 0.23021068702465386, "learning_rate": 5.581944295549279e-06, "loss": 0.0876, "step": 723 }, { "epoch": 0.9270166453265045, "grad_norm": 0.24986957292091488, "learning_rate": 5.571955156245803e-06, "loss": 0.0808, "step": 724 }, { "epoch": 0.9282970550576184, "grad_norm": 0.23170643657478066, "learning_rate": 5.561963703282429e-06, "loss": 0.1199, "step": 725 }, { "epoch": 0.9295774647887324, "grad_norm": 0.21286889151145547, "learning_rate": 5.55196997707635e-06, "loss": 0.0777, "step": 726 }, { "epoch": 0.9308578745198464, "grad_norm": 0.26163080216231355, "learning_rate": 5.541974018053959e-06, "loss": 0.0971, "step": 727 }, { "epoch": 0.9321382842509603, "grad_norm": 0.20786337449967132, "learning_rate": 5.531975866650684e-06, "loss": 0.0889, "step": 728 }, { "epoch": 0.9334186939820742, "grad_norm": 0.20019303261840407, "learning_rate": 5.521975563310817e-06, "loss": 0.0752, "step": 729 }, { "epoch": 0.9346991037131882, "grad_norm": 0.24852998443689264, "learning_rate": 5.511973148487354e-06, "loss": 0.0819, "step": 730 }, { "epoch": 0.9359795134443022, "grad_norm": 0.2055268487883145, "learning_rate": 5.501968662641834e-06, "loss": 0.1003, "step": 731 }, { "epoch": 0.9372599231754162, "grad_norm": 0.20581762869127163, "learning_rate": 5.4919621462441765e-06, "loss": 0.0917, "step": 732 }, { "epoch": 0.93854033290653, "grad_norm": 0.1990808342337929, "learning_rate": 5.481953639772513e-06, "loss": 0.0779, "step": 733 }, { "epoch": 0.939820742637644, "grad_norm": 0.22001537369420976, "learning_rate": 5.471943183713021e-06, "loss": 0.1109, "step": 734 }, { "epoch": 0.941101152368758, "grad_norm": 0.18924762727669558, "learning_rate": 5.461930818559771e-06, "loss": 0.0777, "step": 735 }, { "epoch": 0.942381562099872, "grad_norm": 0.21809683246750056, "learning_rate": 5.451916584814552e-06, "loss": 0.0929, "step": 736 }, { "epoch": 0.9436619718309859, "grad_norm": 0.21268936707290262, "learning_rate": 5.441900522986712e-06, "loss": 0.1041, "step": 737 }, { "epoch": 0.9449423815620999, "grad_norm": 0.23405010206761892, "learning_rate": 5.431882673592998e-06, "loss": 0.0858, "step": 738 }, { "epoch": 0.9462227912932138, "grad_norm": 0.22881618024029346, "learning_rate": 5.421863077157384e-06, "loss": 0.0934, "step": 739 }, { "epoch": 0.9475032010243278, "grad_norm": 0.22409683026254149, "learning_rate": 5.411841774210911e-06, "loss": 0.0977, "step": 740 }, { "epoch": 0.9487836107554417, "grad_norm": 0.20785151967456247, "learning_rate": 5.401818805291524e-06, "loss": 0.0903, "step": 741 }, { "epoch": 0.9500640204865557, "grad_norm": 0.2185029103225065, "learning_rate": 5.391794210943908e-06, "loss": 0.102, "step": 742 }, { "epoch": 0.9513444302176697, "grad_norm": 0.21873989610229555, "learning_rate": 5.381768031719322e-06, "loss": 0.107, "step": 743 }, { "epoch": 0.9526248399487837, "grad_norm": 0.20095639281905325, "learning_rate": 5.371740308175437e-06, "loss": 0.0859, "step": 744 }, { "epoch": 0.9539052496798975, "grad_norm": 0.20360054641929634, "learning_rate": 5.361711080876172e-06, "loss": 0.0781, "step": 745 }, { "epoch": 0.9551856594110115, "grad_norm": 0.22823079213018804, "learning_rate": 5.351680390391525e-06, "loss": 0.1105, "step": 746 }, { "epoch": 0.9564660691421255, "grad_norm": 0.2354888891882964, "learning_rate": 5.3416482772974165e-06, "loss": 0.091, "step": 747 }, { "epoch": 0.9577464788732394, "grad_norm": 0.22613884719083754, "learning_rate": 5.33161478217552e-06, "loss": 0.0963, "step": 748 }, { "epoch": 0.9590268886043534, "grad_norm": 0.2216976559273003, "learning_rate": 5.321579945613102e-06, "loss": 0.0971, "step": 749 }, { "epoch": 0.9603072983354674, "grad_norm": 0.21285642694377846, "learning_rate": 5.311543808202853e-06, "loss": 0.089, "step": 750 }, { "epoch": 0.9615877080665813, "grad_norm": 0.20156107708336243, "learning_rate": 5.301506410542725e-06, "loss": 0.0908, "step": 751 }, { "epoch": 0.9628681177976952, "grad_norm": 0.19290678335395103, "learning_rate": 5.2914677932357695e-06, "loss": 0.066, "step": 752 }, { "epoch": 0.9641485275288092, "grad_norm": 0.21071305556945374, "learning_rate": 5.281427996889972e-06, "loss": 0.0877, "step": 753 }, { "epoch": 0.9654289372599232, "grad_norm": 0.19569363521586788, "learning_rate": 5.2713870621180865e-06, "loss": 0.0813, "step": 754 }, { "epoch": 0.9667093469910372, "grad_norm": 0.19973452327544078, "learning_rate": 5.261345029537473e-06, "loss": 0.0826, "step": 755 }, { "epoch": 0.967989756722151, "grad_norm": 0.21753656088111517, "learning_rate": 5.2513019397699305e-06, "loss": 0.0787, "step": 756 }, { "epoch": 0.969270166453265, "grad_norm": 0.2111869622944084, "learning_rate": 5.241257833441535e-06, "loss": 0.0948, "step": 757 }, { "epoch": 0.970550576184379, "grad_norm": 0.2103275213831526, "learning_rate": 5.231212751182477e-06, "loss": 0.0993, "step": 758 }, { "epoch": 0.971830985915493, "grad_norm": 0.1999648844724173, "learning_rate": 5.221166733626895e-06, "loss": 0.0759, "step": 759 }, { "epoch": 0.9731113956466069, "grad_norm": 0.21526613604092565, "learning_rate": 5.211119821412706e-06, "loss": 0.0795, "step": 760 }, { "epoch": 0.9743918053777209, "grad_norm": 0.21717805344759095, "learning_rate": 5.20107205518145e-06, "loss": 0.1046, "step": 761 }, { "epoch": 0.9756722151088348, "grad_norm": 0.2148134503209442, "learning_rate": 5.191023475578122e-06, "loss": 0.0757, "step": 762 }, { "epoch": 0.9769526248399488, "grad_norm": 0.20109736585054971, "learning_rate": 5.180974123251003e-06, "loss": 0.0859, "step": 763 }, { "epoch": 0.9782330345710627, "grad_norm": 0.21590590380969116, "learning_rate": 5.170924038851507e-06, "loss": 0.1039, "step": 764 }, { "epoch": 0.9795134443021767, "grad_norm": 0.21875806831737962, "learning_rate": 5.160873263034003e-06, "loss": 0.0784, "step": 765 }, { "epoch": 0.9807938540332907, "grad_norm": 0.22217291386780197, "learning_rate": 5.1508218364556596e-06, "loss": 0.0981, "step": 766 }, { "epoch": 0.9820742637644047, "grad_norm": 0.2248780582602742, "learning_rate": 5.140769799776277e-06, "loss": 0.1092, "step": 767 }, { "epoch": 0.9833546734955185, "grad_norm": 0.2919981047680063, "learning_rate": 5.130717193658125e-06, "loss": 0.1001, "step": 768 }, { "epoch": 0.9846350832266325, "grad_norm": 0.22208453751490922, "learning_rate": 5.120664058765773e-06, "loss": 0.1051, "step": 769 }, { "epoch": 0.9859154929577465, "grad_norm": 0.1890084345200068, "learning_rate": 5.110610435765935e-06, "loss": 0.0778, "step": 770 }, { "epoch": 0.9871959026888605, "grad_norm": 0.21214316665420105, "learning_rate": 5.100556365327295e-06, "loss": 0.0914, "step": 771 }, { "epoch": 0.9884763124199744, "grad_norm": 0.1983827463135624, "learning_rate": 5.0905018881203505e-06, "loss": 0.0871, "step": 772 }, { "epoch": 0.9897567221510883, "grad_norm": 0.19637228440646418, "learning_rate": 5.080447044817238e-06, "loss": 0.0677, "step": 773 }, { "epoch": 0.9910371318822023, "grad_norm": 0.20791215134817592, "learning_rate": 5.070391876091582e-06, "loss": 0.0863, "step": 774 }, { "epoch": 0.9923175416133163, "grad_norm": 0.2830801849730667, "learning_rate": 5.060336422618323e-06, "loss": 0.0991, "step": 775 }, { "epoch": 0.9935979513444302, "grad_norm": 0.2038888069704103, "learning_rate": 5.05028072507355e-06, "loss": 0.0818, "step": 776 }, { "epoch": 0.9948783610755442, "grad_norm": 0.2087054609393056, "learning_rate": 5.040224824134337e-06, "loss": 0.0982, "step": 777 }, { "epoch": 0.9961587708066582, "grad_norm": 0.2044598674309634, "learning_rate": 5.030168760478586e-06, "loss": 0.0935, "step": 778 }, { "epoch": 0.9974391805377721, "grad_norm": 0.23453450649130742, "learning_rate": 5.020112574784857e-06, "loss": 0.0874, "step": 779 }, { "epoch": 0.998719590268886, "grad_norm": 0.2045323699868877, "learning_rate": 5.010056307732202e-06, "loss": 0.0944, "step": 780 }, { "epoch": 1.0, "grad_norm": 0.1690757502548154, "learning_rate": 5e-06, "loss": 0.0527, "step": 781 }, { "epoch": 1.0012804097311139, "grad_norm": 0.18450052023462973, "learning_rate": 4.9899436922678004e-06, "loss": 0.0736, "step": 782 }, { "epoch": 1.002560819462228, "grad_norm": 0.17340800605765952, "learning_rate": 4.979887425215144e-06, "loss": 0.0589, "step": 783 }, { "epoch": 1.0038412291933418, "grad_norm": 0.18018909322693144, "learning_rate": 4.969831239521415e-06, "loss": 0.0728, "step": 784 }, { "epoch": 1.0051216389244557, "grad_norm": 0.17795118760693907, "learning_rate": 4.959775175865666e-06, "loss": 0.0681, "step": 785 }, { "epoch": 1.0064020486555698, "grad_norm": 0.17063515092015494, "learning_rate": 4.949719274926452e-06, "loss": 0.0591, "step": 786 }, { "epoch": 1.0076824583866837, "grad_norm": 0.17971341543913885, "learning_rate": 4.939663577381679e-06, "loss": 0.07, "step": 787 }, { "epoch": 1.0089628681177978, "grad_norm": 0.18547302186224024, "learning_rate": 4.9296081239084176e-06, "loss": 0.0693, "step": 788 }, { "epoch": 1.0102432778489117, "grad_norm": 0.17869341802096425, "learning_rate": 4.919552955182763e-06, "loss": 0.0585, "step": 789 }, { "epoch": 1.0115236875800255, "grad_norm": 0.177939444032322, "learning_rate": 4.909498111879653e-06, "loss": 0.056, "step": 790 }, { "epoch": 1.0128040973111396, "grad_norm": 0.18461685969037228, "learning_rate": 4.899443634672706e-06, "loss": 0.0585, "step": 791 }, { "epoch": 1.0140845070422535, "grad_norm": 0.2031038464574364, "learning_rate": 4.8893895642340665e-06, "loss": 0.0731, "step": 792 }, { "epoch": 1.0153649167733674, "grad_norm": 0.18485510103212133, "learning_rate": 4.879335941234228e-06, "loss": 0.0576, "step": 793 }, { "epoch": 1.0166453265044815, "grad_norm": 0.19262452212924525, "learning_rate": 4.8692828063418775e-06, "loss": 0.0606, "step": 794 }, { "epoch": 1.0179257362355953, "grad_norm": 0.21597558495659974, "learning_rate": 4.8592302002237245e-06, "loss": 0.0729, "step": 795 }, { "epoch": 1.0192061459667094, "grad_norm": 0.20114394972838007, "learning_rate": 4.84917816354434e-06, "loss": 0.0655, "step": 796 }, { "epoch": 1.0204865556978233, "grad_norm": 0.1943093487531922, "learning_rate": 4.839126736965998e-06, "loss": 0.0678, "step": 797 }, { "epoch": 1.0217669654289372, "grad_norm": 0.21771441467965788, "learning_rate": 4.829075961148495e-06, "loss": 0.0596, "step": 798 }, { "epoch": 1.0230473751600513, "grad_norm": 0.19369079966007602, "learning_rate": 4.8190258767489976e-06, "loss": 0.0664, "step": 799 }, { "epoch": 1.0243277848911652, "grad_norm": 0.20996164318714716, "learning_rate": 4.80897652442188e-06, "loss": 0.0884, "step": 800 }, { "epoch": 1.0243277848911652, "eval_loss": 0.08849632740020752, "eval_runtime": 10.9256, "eval_samples_per_second": 23.157, "eval_steps_per_second": 5.858, "step": 800 }, { "epoch": 1.025608194622279, "grad_norm": 0.20204094004612355, "learning_rate": 4.798927944818552e-06, "loss": 0.0753, "step": 801 }, { "epoch": 1.0268886043533931, "grad_norm": 0.1977099683784906, "learning_rate": 4.788880178587295e-06, "loss": 0.0595, "step": 802 }, { "epoch": 1.028169014084507, "grad_norm": 0.17845225588391359, "learning_rate": 4.778833266373107e-06, "loss": 0.0506, "step": 803 }, { "epoch": 1.029449423815621, "grad_norm": 0.20487635905457094, "learning_rate": 4.768787248817523e-06, "loss": 0.0818, "step": 804 }, { "epoch": 1.030729833546735, "grad_norm": 0.20620899591510972, "learning_rate": 4.7587421665584655e-06, "loss": 0.0741, "step": 805 }, { "epoch": 1.0320102432778488, "grad_norm": 0.19762934748756558, "learning_rate": 4.748698060230072e-06, "loss": 0.0731, "step": 806 }, { "epoch": 1.033290653008963, "grad_norm": 0.1909835933615634, "learning_rate": 4.738654970462528e-06, "loss": 0.0536, "step": 807 }, { "epoch": 1.0345710627400768, "grad_norm": 0.21653112643914008, "learning_rate": 4.728612937881915e-06, "loss": 0.078, "step": 808 }, { "epoch": 1.0358514724711907, "grad_norm": 0.2142425072446757, "learning_rate": 4.71857200311003e-06, "loss": 0.0809, "step": 809 }, { "epoch": 1.0371318822023048, "grad_norm": 0.21087780090654004, "learning_rate": 4.708532206764232e-06, "loss": 0.073, "step": 810 }, { "epoch": 1.0384122919334187, "grad_norm": 0.2062353076029648, "learning_rate": 4.698493589457277e-06, "loss": 0.0813, "step": 811 }, { "epoch": 1.0396927016645328, "grad_norm": 0.20417345382781343, "learning_rate": 4.688456191797148e-06, "loss": 0.0745, "step": 812 }, { "epoch": 1.0409731113956466, "grad_norm": 0.1874332642916751, "learning_rate": 4.678420054386899e-06, "loss": 0.0568, "step": 813 }, { "epoch": 1.0422535211267605, "grad_norm": 0.19719361133712385, "learning_rate": 4.668385217824482e-06, "loss": 0.0704, "step": 814 }, { "epoch": 1.0435339308578746, "grad_norm": 0.20739016774037022, "learning_rate": 4.658351722702585e-06, "loss": 0.0745, "step": 815 }, { "epoch": 1.0448143405889885, "grad_norm": 0.20702775006463095, "learning_rate": 4.648319609608477e-06, "loss": 0.0802, "step": 816 }, { "epoch": 1.0460947503201024, "grad_norm": 0.21112282737055701, "learning_rate": 4.6382889191238306e-06, "loss": 0.0678, "step": 817 }, { "epoch": 1.0473751600512164, "grad_norm": 0.21060482978835185, "learning_rate": 4.6282596918245635e-06, "loss": 0.0856, "step": 818 }, { "epoch": 1.0486555697823303, "grad_norm": 0.18900786448680965, "learning_rate": 4.61823196828068e-06, "loss": 0.0642, "step": 819 }, { "epoch": 1.0499359795134442, "grad_norm": 0.20485663192778605, "learning_rate": 4.608205789056094e-06, "loss": 0.0748, "step": 820 }, { "epoch": 1.0512163892445583, "grad_norm": 0.19803319717883325, "learning_rate": 4.598181194708477e-06, "loss": 0.0689, "step": 821 }, { "epoch": 1.0524967989756722, "grad_norm": 0.21969961746836322, "learning_rate": 4.588158225789091e-06, "loss": 0.0774, "step": 822 }, { "epoch": 1.0537772087067863, "grad_norm": 0.21532761956940846, "learning_rate": 4.578136922842617e-06, "loss": 0.0699, "step": 823 }, { "epoch": 1.0550576184379001, "grad_norm": 0.20743640954016887, "learning_rate": 4.568117326407003e-06, "loss": 0.0756, "step": 824 }, { "epoch": 1.056338028169014, "grad_norm": 0.2026471831485644, "learning_rate": 4.558099477013288e-06, "loss": 0.0692, "step": 825 }, { "epoch": 1.057618437900128, "grad_norm": 0.19706955103865778, "learning_rate": 4.548083415185449e-06, "loss": 0.0729, "step": 826 }, { "epoch": 1.058898847631242, "grad_norm": 0.20787809940246563, "learning_rate": 4.5380691814402315e-06, "loss": 0.0677, "step": 827 }, { "epoch": 1.0601792573623559, "grad_norm": 0.20088731956181705, "learning_rate": 4.528056816286982e-06, "loss": 0.077, "step": 828 }, { "epoch": 1.06145966709347, "grad_norm": 0.21158171676711524, "learning_rate": 4.5180463602274896e-06, "loss": 0.0686, "step": 829 }, { "epoch": 1.0627400768245838, "grad_norm": 0.2088525376329303, "learning_rate": 4.508037853755825e-06, "loss": 0.0748, "step": 830 }, { "epoch": 1.064020486555698, "grad_norm": 0.21066776799440956, "learning_rate": 4.498031337358167e-06, "loss": 0.0727, "step": 831 }, { "epoch": 1.0653008962868118, "grad_norm": 0.18860332756512888, "learning_rate": 4.488026851512648e-06, "loss": 0.0573, "step": 832 }, { "epoch": 1.0665813060179257, "grad_norm": 0.19176806986897754, "learning_rate": 4.478024436689185e-06, "loss": 0.0638, "step": 833 }, { "epoch": 1.0678617157490398, "grad_norm": 0.20172418555213295, "learning_rate": 4.468024133349316e-06, "loss": 0.0715, "step": 834 }, { "epoch": 1.0691421254801536, "grad_norm": 0.2024531498771351, "learning_rate": 4.458025981946041e-06, "loss": 0.0681, "step": 835 }, { "epoch": 1.0704225352112675, "grad_norm": 0.21121312266540856, "learning_rate": 4.4480300229236525e-06, "loss": 0.0767, "step": 836 }, { "epoch": 1.0717029449423816, "grad_norm": 0.18968697408742455, "learning_rate": 4.4380362967175725e-06, "loss": 0.0626, "step": 837 }, { "epoch": 1.0729833546734955, "grad_norm": 0.21621725875235998, "learning_rate": 4.428044843754198e-06, "loss": 0.0625, "step": 838 }, { "epoch": 1.0742637644046096, "grad_norm": 0.19296966877130417, "learning_rate": 4.418055704450721e-06, "loss": 0.0636, "step": 839 }, { "epoch": 1.0755441741357235, "grad_norm": 0.20652371549446835, "learning_rate": 4.4080689192149835e-06, "loss": 0.0756, "step": 840 }, { "epoch": 1.0768245838668373, "grad_norm": 0.1988450656383578, "learning_rate": 4.398084528445297e-06, "loss": 0.0739, "step": 841 }, { "epoch": 1.0781049935979514, "grad_norm": 0.1999686727041779, "learning_rate": 4.388102572530291e-06, "loss": 0.0727, "step": 842 }, { "epoch": 1.0793854033290653, "grad_norm": 0.22544286033199262, "learning_rate": 4.378123091848747e-06, "loss": 0.0937, "step": 843 }, { "epoch": 1.0806658130601792, "grad_norm": 0.20724468869415336, "learning_rate": 4.368146126769432e-06, "loss": 0.0777, "step": 844 }, { "epoch": 1.0819462227912933, "grad_norm": 0.24072862499869555, "learning_rate": 4.35817171765093e-06, "loss": 0.0599, "step": 845 }, { "epoch": 1.0832266325224071, "grad_norm": 0.2131640608306155, "learning_rate": 4.3481999048415e-06, "loss": 0.0737, "step": 846 }, { "epoch": 1.084507042253521, "grad_norm": 0.19490346302240336, "learning_rate": 4.338230728678888e-06, "loss": 0.0631, "step": 847 }, { "epoch": 1.085787451984635, "grad_norm": 0.2165341861283, "learning_rate": 4.328264229490177e-06, "loss": 0.0872, "step": 848 }, { "epoch": 1.087067861715749, "grad_norm": 0.20659124360042125, "learning_rate": 4.318300447591623e-06, "loss": 0.0711, "step": 849 }, { "epoch": 1.088348271446863, "grad_norm": 0.2127425434387985, "learning_rate": 4.3083394232884845e-06, "loss": 0.084, "step": 850 }, { "epoch": 1.089628681177977, "grad_norm": 0.20682512194566957, "learning_rate": 4.2983811968748726e-06, "loss": 0.0738, "step": 851 }, { "epoch": 1.0909090909090908, "grad_norm": 0.206087855853219, "learning_rate": 4.2884258086335755e-06, "loss": 0.0685, "step": 852 }, { "epoch": 1.092189500640205, "grad_norm": 0.23450258590283424, "learning_rate": 4.2784732988359005e-06, "loss": 0.0635, "step": 853 }, { "epoch": 1.0934699103713188, "grad_norm": 0.20121888836959512, "learning_rate": 4.268523707741514e-06, "loss": 0.0675, "step": 854 }, { "epoch": 1.094750320102433, "grad_norm": 0.20396239153209025, "learning_rate": 4.258577075598275e-06, "loss": 0.0818, "step": 855 }, { "epoch": 1.0960307298335468, "grad_norm": 0.19632960057971455, "learning_rate": 4.248633442642067e-06, "loss": 0.0597, "step": 856 }, { "epoch": 1.0973111395646606, "grad_norm": 0.20420278053182667, "learning_rate": 4.238692849096652e-06, "loss": 0.0698, "step": 857 }, { "epoch": 1.0985915492957747, "grad_norm": 0.2226761781033323, "learning_rate": 4.228755335173488e-06, "loss": 0.0743, "step": 858 }, { "epoch": 1.0998719590268886, "grad_norm": 0.19401515541492712, "learning_rate": 4.218820941071582e-06, "loss": 0.0604, "step": 859 }, { "epoch": 1.1011523687580025, "grad_norm": 0.19620931946246928, "learning_rate": 4.208889706977315e-06, "loss": 0.0606, "step": 860 }, { "epoch": 1.1024327784891166, "grad_norm": 0.19302640560891826, "learning_rate": 4.1989616730642866e-06, "loss": 0.0686, "step": 861 }, { "epoch": 1.1037131882202305, "grad_norm": 0.20890717603013675, "learning_rate": 4.189036879493155e-06, "loss": 0.07, "step": 862 }, { "epoch": 1.1049935979513443, "grad_norm": 0.18715067750349357, "learning_rate": 4.179115366411467e-06, "loss": 0.0604, "step": 863 }, { "epoch": 1.1062740076824584, "grad_norm": 0.21283034574927892, "learning_rate": 4.169197173953498e-06, "loss": 0.0678, "step": 864 }, { "epoch": 1.1075544174135723, "grad_norm": 0.20926296174922984, "learning_rate": 4.159282342240096e-06, "loss": 0.0744, "step": 865 }, { "epoch": 1.1088348271446864, "grad_norm": 0.1970163881380724, "learning_rate": 4.149370911378509e-06, "loss": 0.0591, "step": 866 }, { "epoch": 1.1101152368758003, "grad_norm": 0.20759202054256357, "learning_rate": 4.139462921462233e-06, "loss": 0.0767, "step": 867 }, { "epoch": 1.1113956466069141, "grad_norm": 0.1997400062913058, "learning_rate": 4.129558412570839e-06, "loss": 0.0711, "step": 868 }, { "epoch": 1.1126760563380282, "grad_norm": 0.1810795091326489, "learning_rate": 4.119657424769819e-06, "loss": 0.0484, "step": 869 }, { "epoch": 1.1139564660691421, "grad_norm": 0.2032080253374902, "learning_rate": 4.109759998110426e-06, "loss": 0.0724, "step": 870 }, { "epoch": 1.115236875800256, "grad_norm": 0.19115302596204736, "learning_rate": 4.099866172629501e-06, "loss": 0.0599, "step": 871 }, { "epoch": 1.11651728553137, "grad_norm": 0.18132758709502073, "learning_rate": 4.089975988349319e-06, "loss": 0.0515, "step": 872 }, { "epoch": 1.117797695262484, "grad_norm": 0.18673863621897682, "learning_rate": 4.0800894852774295e-06, "loss": 0.0519, "step": 873 }, { "epoch": 1.119078104993598, "grad_norm": 0.22867315026955162, "learning_rate": 4.070206703406486e-06, "loss": 0.0827, "step": 874 }, { "epoch": 1.120358514724712, "grad_norm": 0.19953234933576317, "learning_rate": 4.060327682714095e-06, "loss": 0.0672, "step": 875 }, { "epoch": 1.1216389244558258, "grad_norm": 0.18667178624118597, "learning_rate": 4.050452463162643e-06, "loss": 0.0541, "step": 876 }, { "epoch": 1.12291933418694, "grad_norm": 0.20666599767334667, "learning_rate": 4.040581084699138e-06, "loss": 0.0773, "step": 877 }, { "epoch": 1.1241997439180538, "grad_norm": 0.19665304045259357, "learning_rate": 4.03071358725506e-06, "loss": 0.0668, "step": 878 }, { "epoch": 1.1254801536491676, "grad_norm": 0.1950210779797977, "learning_rate": 4.0208500107461814e-06, "loss": 0.0626, "step": 879 }, { "epoch": 1.1267605633802817, "grad_norm": 0.21815336419329803, "learning_rate": 4.010990395072414e-06, "loss": 0.0808, "step": 880 }, { "epoch": 1.1280409731113956, "grad_norm": 0.2218954203590765, "learning_rate": 4.001134780117654e-06, "loss": 0.0888, "step": 881 }, { "epoch": 1.1293213828425097, "grad_norm": 0.19165216797620277, "learning_rate": 3.991283205749603e-06, "loss": 0.0564, "step": 882 }, { "epoch": 1.1306017925736236, "grad_norm": 0.20162673717443436, "learning_rate": 3.98143571181963e-06, "loss": 0.0596, "step": 883 }, { "epoch": 1.1318822023047375, "grad_norm": 0.2231499745813081, "learning_rate": 3.971592338162589e-06, "loss": 0.086, "step": 884 }, { "epoch": 1.1331626120358516, "grad_norm": 0.2005534651215062, "learning_rate": 3.961753124596669e-06, "loss": 0.0773, "step": 885 }, { "epoch": 1.1344430217669654, "grad_norm": 0.19833313189802676, "learning_rate": 3.9519181109232355e-06, "loss": 0.0581, "step": 886 }, { "epoch": 1.1357234314980793, "grad_norm": 0.1967085012479549, "learning_rate": 3.9420873369266555e-06, "loss": 0.0695, "step": 887 }, { "epoch": 1.1370038412291934, "grad_norm": 0.20042025328178645, "learning_rate": 3.932260842374153e-06, "loss": 0.0587, "step": 888 }, { "epoch": 1.1382842509603073, "grad_norm": 0.19707584213888377, "learning_rate": 3.922438667015639e-06, "loss": 0.0647, "step": 889 }, { "epoch": 1.1395646606914211, "grad_norm": 0.20801133343225225, "learning_rate": 3.912620850583553e-06, "loss": 0.0775, "step": 890 }, { "epoch": 1.1408450704225352, "grad_norm": 0.20114691644341573, "learning_rate": 3.902807432792698e-06, "loss": 0.0683, "step": 891 }, { "epoch": 1.1421254801536491, "grad_norm": 0.2134197066790879, "learning_rate": 3.892998453340091e-06, "loss": 0.0814, "step": 892 }, { "epoch": 1.1434058898847632, "grad_norm": 0.19438882608109453, "learning_rate": 3.883193951904787e-06, "loss": 0.0691, "step": 893 }, { "epoch": 1.144686299615877, "grad_norm": 0.19907060029141857, "learning_rate": 3.873393968147735e-06, "loss": 0.0647, "step": 894 }, { "epoch": 1.145966709346991, "grad_norm": 0.20354385274664547, "learning_rate": 3.863598541711602e-06, "loss": 0.066, "step": 895 }, { "epoch": 1.147247119078105, "grad_norm": 0.19573886429428136, "learning_rate": 3.853807712220623e-06, "loss": 0.0725, "step": 896 }, { "epoch": 1.148527528809219, "grad_norm": 0.21239262468885894, "learning_rate": 3.844021519280439e-06, "loss": 0.0808, "step": 897 }, { "epoch": 1.149807938540333, "grad_norm": 0.20900364696492912, "learning_rate": 3.83424000247793e-06, "loss": 0.0638, "step": 898 }, { "epoch": 1.151088348271447, "grad_norm": 0.1994134076049075, "learning_rate": 3.824463201381067e-06, "loss": 0.0631, "step": 899 }, { "epoch": 1.1523687580025608, "grad_norm": 0.19599934165962632, "learning_rate": 3.8146911555387416e-06, "loss": 0.0588, "step": 900 }, { "epoch": 1.1536491677336747, "grad_norm": 0.20821179550958469, "learning_rate": 3.8049239044806074e-06, "loss": 0.0624, "step": 901 }, { "epoch": 1.1549295774647887, "grad_norm": 0.18890230389643925, "learning_rate": 3.7951614877169285e-06, "loss": 0.0637, "step": 902 }, { "epoch": 1.1562099871959026, "grad_norm": 0.19184760243149585, "learning_rate": 3.7854039447384066e-06, "loss": 0.0622, "step": 903 }, { "epoch": 1.1574903969270167, "grad_norm": 0.19922392902809172, "learning_rate": 3.7756513150160324e-06, "loss": 0.0645, "step": 904 }, { "epoch": 1.1587708066581306, "grad_norm": 0.2115894101616335, "learning_rate": 3.7659036380009195e-06, "loss": 0.0763, "step": 905 }, { "epoch": 1.1600512163892445, "grad_norm": 0.19518468422654678, "learning_rate": 3.7561609531241495e-06, "loss": 0.0554, "step": 906 }, { "epoch": 1.1613316261203586, "grad_norm": 0.19065468355200652, "learning_rate": 3.746423299796604e-06, "loss": 0.0621, "step": 907 }, { "epoch": 1.1626120358514724, "grad_norm": 0.21815326268117458, "learning_rate": 3.7366907174088207e-06, "loss": 0.0835, "step": 908 }, { "epoch": 1.1638924455825865, "grad_norm": 0.23235350571808625, "learning_rate": 3.7269632453308147e-06, "loss": 0.0624, "step": 909 }, { "epoch": 1.1651728553137004, "grad_norm": 0.1893073867880245, "learning_rate": 3.7172409229119353e-06, "loss": 0.0544, "step": 910 }, { "epoch": 1.1664532650448143, "grad_norm": 0.19700425438521607, "learning_rate": 3.707523789480699e-06, "loss": 0.058, "step": 911 }, { "epoch": 1.1677336747759284, "grad_norm": 0.1924564630829821, "learning_rate": 3.6978118843446296e-06, "loss": 0.057, "step": 912 }, { "epoch": 1.1690140845070423, "grad_norm": 0.19224829294167367, "learning_rate": 3.6881052467901056e-06, "loss": 0.0626, "step": 913 }, { "epoch": 1.1702944942381561, "grad_norm": 0.20581104695239014, "learning_rate": 3.6784039160821927e-06, "loss": 0.0719, "step": 914 }, { "epoch": 1.1715749039692702, "grad_norm": 0.21268995197608395, "learning_rate": 3.6687079314644915e-06, "loss": 0.0681, "step": 915 }, { "epoch": 1.172855313700384, "grad_norm": 0.22724624136888913, "learning_rate": 3.6590173321589807e-06, "loss": 0.0797, "step": 916 }, { "epoch": 1.174135723431498, "grad_norm": 0.21155491144125307, "learning_rate": 3.6493321573658475e-06, "loss": 0.0746, "step": 917 }, { "epoch": 1.175416133162612, "grad_norm": 0.2257711234149272, "learning_rate": 3.6396524462633376e-06, "loss": 0.0859, "step": 918 }, { "epoch": 1.176696542893726, "grad_norm": 0.21314211779904535, "learning_rate": 3.6299782380076004e-06, "loss": 0.0567, "step": 919 }, { "epoch": 1.17797695262484, "grad_norm": 0.2014886304871662, "learning_rate": 3.6203095717325187e-06, "loss": 0.0616, "step": 920 }, { "epoch": 1.179257362355954, "grad_norm": 0.21305343512658692, "learning_rate": 3.610646486549563e-06, "loss": 0.0866, "step": 921 }, { "epoch": 1.1805377720870678, "grad_norm": 0.19346411248269488, "learning_rate": 3.6009890215476227e-06, "loss": 0.0552, "step": 922 }, { "epoch": 1.1818181818181819, "grad_norm": 0.2033385377656553, "learning_rate": 3.5913372157928515e-06, "loss": 0.0669, "step": 923 }, { "epoch": 1.1830985915492958, "grad_norm": 0.2211407490118448, "learning_rate": 3.5816911083285165e-06, "loss": 0.085, "step": 924 }, { "epoch": 1.1843790012804098, "grad_norm": 0.19866123925839885, "learning_rate": 3.572050738174831e-06, "loss": 0.0639, "step": 925 }, { "epoch": 1.1856594110115237, "grad_norm": 0.2135176986380743, "learning_rate": 3.5624161443287954e-06, "loss": 0.0735, "step": 926 }, { "epoch": 1.1869398207426376, "grad_norm": 0.23412336745331916, "learning_rate": 3.552787365764053e-06, "loss": 0.0837, "step": 927 }, { "epoch": 1.1882202304737517, "grad_norm": 0.19048581888992958, "learning_rate": 3.5431644414307136e-06, "loss": 0.0648, "step": 928 }, { "epoch": 1.1895006402048656, "grad_norm": 0.18993275011232116, "learning_rate": 3.5335474102552152e-06, "loss": 0.0574, "step": 929 }, { "epoch": 1.1907810499359794, "grad_norm": 0.17666251309434333, "learning_rate": 3.5239363111401487e-06, "loss": 0.051, "step": 930 }, { "epoch": 1.1920614596670935, "grad_norm": 0.2119920725830039, "learning_rate": 3.5143311829641117e-06, "loss": 0.0688, "step": 931 }, { "epoch": 1.1933418693982074, "grad_norm": 0.1913907733461757, "learning_rate": 3.504732064581553e-06, "loss": 0.0596, "step": 932 }, { "epoch": 1.1946222791293213, "grad_norm": 0.19227359679889355, "learning_rate": 3.495138994822603e-06, "loss": 0.0578, "step": 933 }, { "epoch": 1.1959026888604354, "grad_norm": 0.19920660408783095, "learning_rate": 3.4855520124929267e-06, "loss": 0.0685, "step": 934 }, { "epoch": 1.1971830985915493, "grad_norm": 0.20393682104067662, "learning_rate": 3.4759711563735676e-06, "loss": 0.0667, "step": 935 }, { "epoch": 1.1984635083226634, "grad_norm": 0.20538394739659105, "learning_rate": 3.4663964652207844e-06, "loss": 0.0681, "step": 936 }, { "epoch": 1.1997439180537772, "grad_norm": 0.17967704694762326, "learning_rate": 3.456827977765901e-06, "loss": 0.0548, "step": 937 }, { "epoch": 1.201024327784891, "grad_norm": 0.18771533064203305, "learning_rate": 3.447265732715142e-06, "loss": 0.0514, "step": 938 }, { "epoch": 1.2023047375160052, "grad_norm": 0.1969305329313409, "learning_rate": 3.4377097687494797e-06, "loss": 0.055, "step": 939 }, { "epoch": 1.203585147247119, "grad_norm": 0.20748933143028994, "learning_rate": 3.4281601245244843e-06, "loss": 0.0776, "step": 940 }, { "epoch": 1.204865556978233, "grad_norm": 0.2081746036494643, "learning_rate": 3.4186168386701573e-06, "loss": 0.0732, "step": 941 }, { "epoch": 1.206145966709347, "grad_norm": 0.19375763722815328, "learning_rate": 3.4090799497907778e-06, "loss": 0.0637, "step": 942 }, { "epoch": 1.207426376440461, "grad_norm": 0.2147037599281101, "learning_rate": 3.3995494964647536e-06, "loss": 0.0725, "step": 943 }, { "epoch": 1.2087067861715748, "grad_norm": 0.187165510189722, "learning_rate": 3.390025517244452e-06, "loss": 0.0588, "step": 944 }, { "epoch": 1.2099871959026889, "grad_norm": 0.19568064092552032, "learning_rate": 3.3805080506560607e-06, "loss": 0.0551, "step": 945 }, { "epoch": 1.2112676056338028, "grad_norm": 0.19785126157295427, "learning_rate": 3.370997135199413e-06, "loss": 0.0689, "step": 946 }, { "epoch": 1.2125480153649169, "grad_norm": 0.21322957980340346, "learning_rate": 3.3614928093478485e-06, "loss": 0.0885, "step": 947 }, { "epoch": 1.2138284250960307, "grad_norm": 0.2024752037433109, "learning_rate": 3.35199511154805e-06, "loss": 0.0705, "step": 948 }, { "epoch": 1.2151088348271446, "grad_norm": 0.20281101799626422, "learning_rate": 3.342504080219885e-06, "loss": 0.069, "step": 949 }, { "epoch": 1.2163892445582587, "grad_norm": 0.19648869341063255, "learning_rate": 3.3330197537562544e-06, "loss": 0.0573, "step": 950 }, { "epoch": 1.2176696542893726, "grad_norm": 0.18234273190312156, "learning_rate": 3.3235421705229415e-06, "loss": 0.0523, "step": 951 }, { "epoch": 1.2189500640204867, "grad_norm": 0.20438706587175018, "learning_rate": 3.3140713688584493e-06, "loss": 0.0709, "step": 952 }, { "epoch": 1.2202304737516005, "grad_norm": 0.18820665125254896, "learning_rate": 3.3046073870738442e-06, "loss": 0.0582, "step": 953 }, { "epoch": 1.2215108834827144, "grad_norm": 0.18844758201738432, "learning_rate": 3.295150263452613e-06, "loss": 0.0589, "step": 954 }, { "epoch": 1.2227912932138285, "grad_norm": 0.18539253505498182, "learning_rate": 3.285700036250491e-06, "loss": 0.0586, "step": 955 }, { "epoch": 1.2240717029449424, "grad_norm": 0.19481299926916076, "learning_rate": 3.2762567436953245e-06, "loss": 0.0671, "step": 956 }, { "epoch": 1.2253521126760563, "grad_norm": 0.1948197760762527, "learning_rate": 3.2668204239869046e-06, "loss": 0.0579, "step": 957 }, { "epoch": 1.2266325224071704, "grad_norm": 0.22354629769065412, "learning_rate": 3.2573911152968114e-06, "loss": 0.0672, "step": 958 }, { "epoch": 1.2279129321382842, "grad_norm": 0.18448531610071453, "learning_rate": 3.247968855768273e-06, "loss": 0.0521, "step": 959 }, { "epoch": 1.229193341869398, "grad_norm": 0.18636815096811726, "learning_rate": 3.238553683515996e-06, "loss": 0.0534, "step": 960 }, { "epoch": 1.2304737516005122, "grad_norm": 0.21002005305932167, "learning_rate": 3.2291456366260184e-06, "loss": 0.071, "step": 961 }, { "epoch": 1.231754161331626, "grad_norm": 0.19623216635718685, "learning_rate": 3.2197447531555588e-06, "loss": 0.0582, "step": 962 }, { "epoch": 1.2330345710627402, "grad_norm": 0.20197341774195168, "learning_rate": 3.2103510711328523e-06, "loss": 0.0692, "step": 963 }, { "epoch": 1.234314980793854, "grad_norm": 0.20728311008489456, "learning_rate": 3.2009646285570105e-06, "loss": 0.07, "step": 964 }, { "epoch": 1.235595390524968, "grad_norm": 0.20660837904834334, "learning_rate": 3.191585463397854e-06, "loss": 0.0756, "step": 965 }, { "epoch": 1.236875800256082, "grad_norm": 0.1929313637565013, "learning_rate": 3.182213613595764e-06, "loss": 0.0582, "step": 966 }, { "epoch": 1.2381562099871959, "grad_norm": 0.19745424704519243, "learning_rate": 3.1728491170615362e-06, "loss": 0.0566, "step": 967 }, { "epoch": 1.2394366197183098, "grad_norm": 0.20015632181474693, "learning_rate": 3.1634920116762175e-06, "loss": 0.0657, "step": 968 }, { "epoch": 1.2407170294494239, "grad_norm": 0.21301826511097216, "learning_rate": 3.1541423352909532e-06, "loss": 0.0833, "step": 969 }, { "epoch": 1.2419974391805377, "grad_norm": 0.20107344658071108, "learning_rate": 3.144800125726845e-06, "loss": 0.067, "step": 970 }, { "epoch": 1.2432778489116516, "grad_norm": 0.18926958819122713, "learning_rate": 3.1354654207747804e-06, "loss": 0.0547, "step": 971 }, { "epoch": 1.2445582586427657, "grad_norm": 0.2108441275256878, "learning_rate": 3.1261382581952967e-06, "loss": 0.0695, "step": 972 }, { "epoch": 1.2458386683738796, "grad_norm": 0.20017359308994043, "learning_rate": 3.1168186757184182e-06, "loss": 0.0624, "step": 973 }, { "epoch": 1.2471190781049937, "grad_norm": 0.22913033136835168, "learning_rate": 3.1075067110435055e-06, "loss": 0.0978, "step": 974 }, { "epoch": 1.2483994878361075, "grad_norm": 0.21076083893984035, "learning_rate": 3.098202401839106e-06, "loss": 0.0747, "step": 975 }, { "epoch": 1.2496798975672214, "grad_norm": 0.19062548940915808, "learning_rate": 3.0889057857427974e-06, "loss": 0.0665, "step": 976 }, { "epoch": 1.2509603072983355, "grad_norm": 0.18586331448712382, "learning_rate": 3.0796169003610364e-06, "loss": 0.0519, "step": 977 }, { "epoch": 1.2522407170294494, "grad_norm": 0.19624503800378734, "learning_rate": 3.070335783269015e-06, "loss": 0.0595, "step": 978 }, { "epoch": 1.2535211267605635, "grad_norm": 0.1961084701085617, "learning_rate": 3.061062472010489e-06, "loss": 0.0596, "step": 979 }, { "epoch": 1.2548015364916774, "grad_norm": 0.21305800798730112, "learning_rate": 3.0517970040976496e-06, "loss": 0.0676, "step": 980 }, { "epoch": 1.2560819462227912, "grad_norm": 0.18565431069698893, "learning_rate": 3.0425394170109537e-06, "loss": 0.048, "step": 981 }, { "epoch": 1.2573623559539053, "grad_norm": 0.1889659607019294, "learning_rate": 3.0332897481989794e-06, "loss": 0.0607, "step": 982 }, { "epoch": 1.2586427656850192, "grad_norm": 0.19919257304976531, "learning_rate": 3.0240480350782765e-06, "loss": 0.0668, "step": 983 }, { "epoch": 1.2599231754161333, "grad_norm": 0.20110260429492127, "learning_rate": 3.0148143150332116e-06, "loss": 0.0697, "step": 984 }, { "epoch": 1.2612035851472472, "grad_norm": 0.19832562790211553, "learning_rate": 3.005588625415815e-06, "loss": 0.0582, "step": 985 }, { "epoch": 1.262483994878361, "grad_norm": 0.20371844984614523, "learning_rate": 2.9963710035456393e-06, "loss": 0.0628, "step": 986 }, { "epoch": 1.263764404609475, "grad_norm": 0.19589763682357353, "learning_rate": 2.987161486709593e-06, "loss": 0.0559, "step": 987 }, { "epoch": 1.265044814340589, "grad_norm": 0.19395260094460662, "learning_rate": 2.977960112161805e-06, "loss": 0.0694, "step": 988 }, { "epoch": 1.266325224071703, "grad_norm": 0.19929766629432427, "learning_rate": 2.968766917123467e-06, "loss": 0.0667, "step": 989 }, { "epoch": 1.267605633802817, "grad_norm": 0.2140915395436552, "learning_rate": 2.9595819387826753e-06, "loss": 0.0795, "step": 990 }, { "epoch": 1.2688860435339309, "grad_norm": 0.21619113042785995, "learning_rate": 2.9504052142943e-06, "loss": 0.0914, "step": 991 }, { "epoch": 1.2701664532650447, "grad_norm": 0.195806454703641, "learning_rate": 2.941236780779813e-06, "loss": 0.0654, "step": 992 }, { "epoch": 1.2714468629961588, "grad_norm": 0.21162047202087214, "learning_rate": 2.9320766753271525e-06, "loss": 0.0662, "step": 993 }, { "epoch": 1.2727272727272727, "grad_norm": 0.20641866716083257, "learning_rate": 2.9229249349905686e-06, "loss": 0.0644, "step": 994 }, { "epoch": 1.2740076824583868, "grad_norm": 0.20012936752081067, "learning_rate": 2.9137815967904703e-06, "loss": 0.0741, "step": 995 }, { "epoch": 1.2752880921895007, "grad_norm": 0.2204342094901664, "learning_rate": 2.904646697713279e-06, "loss": 0.0659, "step": 996 }, { "epoch": 1.2765685019206146, "grad_norm": 0.25913495945005405, "learning_rate": 2.895520274711282e-06, "loss": 0.1011, "step": 997 }, { "epoch": 1.2778489116517284, "grad_norm": 0.20347261247334922, "learning_rate": 2.886402364702474e-06, "loss": 0.0716, "step": 998 }, { "epoch": 1.2791293213828425, "grad_norm": 0.1877325082861712, "learning_rate": 2.8772930045704182e-06, "loss": 0.0587, "step": 999 }, { "epoch": 1.2804097311139564, "grad_norm": 0.19980934556904612, "learning_rate": 2.8681922311640896e-06, "loss": 0.0664, "step": 1000 }, { "epoch": 1.2804097311139564, "eval_loss": 0.08692000806331635, "eval_runtime": 10.9118, "eval_samples_per_second": 23.186, "eval_steps_per_second": 5.865, "step": 1000 }, { "epoch": 1.2816901408450705, "grad_norm": 0.203415388207754, "learning_rate": 2.8591000812977245e-06, "loss": 0.0681, "step": 1001 }, { "epoch": 1.2829705505761844, "grad_norm": 0.19548831301389222, "learning_rate": 2.850016591750683e-06, "loss": 0.0697, "step": 1002 }, { "epoch": 1.2842509603072982, "grad_norm": 0.2040080430447603, "learning_rate": 2.8409417992672883e-06, "loss": 0.0781, "step": 1003 }, { "epoch": 1.2855313700384123, "grad_norm": 0.2058165124592602, "learning_rate": 2.831875740556681e-06, "loss": 0.083, "step": 1004 }, { "epoch": 1.2868117797695262, "grad_norm": 0.20219967978799114, "learning_rate": 2.822818452292676e-06, "loss": 0.0651, "step": 1005 }, { "epoch": 1.2880921895006403, "grad_norm": 0.20517453849308098, "learning_rate": 2.8137699711136084e-06, "loss": 0.0703, "step": 1006 }, { "epoch": 1.2893725992317542, "grad_norm": 0.1977334322840122, "learning_rate": 2.8047303336221887e-06, "loss": 0.0625, "step": 1007 }, { "epoch": 1.290653008962868, "grad_norm": 0.21064106480882136, "learning_rate": 2.7956995763853495e-06, "loss": 0.0653, "step": 1008 }, { "epoch": 1.2919334186939821, "grad_norm": 0.18715097373290757, "learning_rate": 2.7866777359341013e-06, "loss": 0.0552, "step": 1009 }, { "epoch": 1.293213828425096, "grad_norm": 0.2125896532946369, "learning_rate": 2.777664848763391e-06, "loss": 0.0649, "step": 1010 }, { "epoch": 1.2944942381562101, "grad_norm": 0.1911876429725511, "learning_rate": 2.7686609513319405e-06, "loss": 0.0669, "step": 1011 }, { "epoch": 1.295774647887324, "grad_norm": 0.19887530995180913, "learning_rate": 2.7596660800621076e-06, "loss": 0.0643, "step": 1012 }, { "epoch": 1.2970550576184379, "grad_norm": 0.20987730316055397, "learning_rate": 2.7506802713397452e-06, "loss": 0.0601, "step": 1013 }, { "epoch": 1.2983354673495517, "grad_norm": 0.1955974658910418, "learning_rate": 2.7417035615140343e-06, "loss": 0.0631, "step": 1014 }, { "epoch": 1.2996158770806658, "grad_norm": 0.19188767175912524, "learning_rate": 2.732735986897361e-06, "loss": 0.0532, "step": 1015 }, { "epoch": 1.3008962868117797, "grad_norm": 0.20278711768860316, "learning_rate": 2.7237775837651505e-06, "loss": 0.0606, "step": 1016 }, { "epoch": 1.3021766965428938, "grad_norm": 0.211266017127511, "learning_rate": 2.714828388355729e-06, "loss": 0.0758, "step": 1017 }, { "epoch": 1.3034571062740077, "grad_norm": 0.2055968697072138, "learning_rate": 2.7058884368701817e-06, "loss": 0.0725, "step": 1018 }, { "epoch": 1.3047375160051216, "grad_norm": 0.20102678912316368, "learning_rate": 2.6969577654721914e-06, "loss": 0.0669, "step": 1019 }, { "epoch": 1.3060179257362357, "grad_norm": 0.19011196343475334, "learning_rate": 2.688036410287904e-06, "loss": 0.0636, "step": 1020 }, { "epoch": 1.3072983354673495, "grad_norm": 0.201915084250308, "learning_rate": 2.6791244074057864e-06, "loss": 0.0616, "step": 1021 }, { "epoch": 1.3085787451984636, "grad_norm": 0.21770137154211996, "learning_rate": 2.670221792876465e-06, "loss": 0.0828, "step": 1022 }, { "epoch": 1.3098591549295775, "grad_norm": 0.21327214460941235, "learning_rate": 2.661328602712592e-06, "loss": 0.0718, "step": 1023 }, { "epoch": 1.3111395646606914, "grad_norm": 0.20552700432241902, "learning_rate": 2.652444872888699e-06, "loss": 0.0641, "step": 1024 }, { "epoch": 1.3124199743918052, "grad_norm": 0.20387405866133654, "learning_rate": 2.643570639341042e-06, "loss": 0.0671, "step": 1025 }, { "epoch": 1.3137003841229193, "grad_norm": 0.21070423281560757, "learning_rate": 2.634705937967471e-06, "loss": 0.0835, "step": 1026 }, { "epoch": 1.3149807938540332, "grad_norm": 0.20462453816924173, "learning_rate": 2.6258508046272745e-06, "loss": 0.0725, "step": 1027 }, { "epoch": 1.3162612035851473, "grad_norm": 0.18524750769058604, "learning_rate": 2.6170052751410313e-06, "loss": 0.0517, "step": 1028 }, { "epoch": 1.3175416133162612, "grad_norm": 0.1948489714270196, "learning_rate": 2.6081693852904773e-06, "loss": 0.057, "step": 1029 }, { "epoch": 1.318822023047375, "grad_norm": 0.2078148834096972, "learning_rate": 2.5993431708183515e-06, "loss": 0.063, "step": 1030 }, { "epoch": 1.3201024327784892, "grad_norm": 0.21272279725400223, "learning_rate": 2.590526667428251e-06, "loss": 0.0651, "step": 1031 }, { "epoch": 1.321382842509603, "grad_norm": 0.19050494899615844, "learning_rate": 2.581719910784498e-06, "loss": 0.0598, "step": 1032 }, { "epoch": 1.3226632522407171, "grad_norm": 0.1950095467311816, "learning_rate": 2.57292293651198e-06, "loss": 0.0613, "step": 1033 }, { "epoch": 1.323943661971831, "grad_norm": 0.21011797658366904, "learning_rate": 2.5641357801960186e-06, "loss": 0.0708, "step": 1034 }, { "epoch": 1.3252240717029449, "grad_norm": 0.20253570169710466, "learning_rate": 2.555358477382215e-06, "loss": 0.0634, "step": 1035 }, { "epoch": 1.326504481434059, "grad_norm": 0.21881986911824405, "learning_rate": 2.5465910635763125e-06, "loss": 0.0755, "step": 1036 }, { "epoch": 1.3277848911651728, "grad_norm": 0.1991374543662798, "learning_rate": 2.537833574244054e-06, "loss": 0.0622, "step": 1037 }, { "epoch": 1.329065300896287, "grad_norm": 0.19674401294389543, "learning_rate": 2.5290860448110377e-06, "loss": 0.0611, "step": 1038 }, { "epoch": 1.3303457106274008, "grad_norm": 0.20274435418584644, "learning_rate": 2.5203485106625642e-06, "loss": 0.0712, "step": 1039 }, { "epoch": 1.3316261203585147, "grad_norm": 0.222889035262687, "learning_rate": 2.511621007143511e-06, "loss": 0.0899, "step": 1040 }, { "epoch": 1.3329065300896286, "grad_norm": 0.19506032921044036, "learning_rate": 2.5029035695581718e-06, "loss": 0.0663, "step": 1041 }, { "epoch": 1.3341869398207427, "grad_norm": 0.19661504155459883, "learning_rate": 2.4941962331701287e-06, "loss": 0.0663, "step": 1042 }, { "epoch": 1.3354673495518565, "grad_norm": 0.2081741014839576, "learning_rate": 2.4854990332020978e-06, "loss": 0.0705, "step": 1043 }, { "epoch": 1.3367477592829706, "grad_norm": 0.18623514337411173, "learning_rate": 2.476812004835791e-06, "loss": 0.0576, "step": 1044 }, { "epoch": 1.3380281690140845, "grad_norm": 0.2071825392783579, "learning_rate": 2.4681351832117815e-06, "loss": 0.0767, "step": 1045 }, { "epoch": 1.3393085787451984, "grad_norm": 0.2013658424749003, "learning_rate": 2.4594686034293454e-06, "loss": 0.0669, "step": 1046 }, { "epoch": 1.3405889884763125, "grad_norm": 0.20310751510488811, "learning_rate": 2.450812300546335e-06, "loss": 0.0707, "step": 1047 }, { "epoch": 1.3418693982074263, "grad_norm": 0.19742170251319627, "learning_rate": 2.442166309579026e-06, "loss": 0.061, "step": 1048 }, { "epoch": 1.3431498079385404, "grad_norm": 0.19425515205085422, "learning_rate": 2.4335306655019854e-06, "loss": 0.0657, "step": 1049 }, { "epoch": 1.3444302176696543, "grad_norm": 0.2017221762304742, "learning_rate": 2.424905403247923e-06, "loss": 0.0724, "step": 1050 }, { "epoch": 1.3457106274007682, "grad_norm": 0.21529685145543417, "learning_rate": 2.416290557707552e-06, "loss": 0.0743, "step": 1051 }, { "epoch": 1.3469910371318823, "grad_norm": 0.20135696008855986, "learning_rate": 2.407686163729445e-06, "loss": 0.0689, "step": 1052 }, { "epoch": 1.3482714468629962, "grad_norm": 0.1942596570025383, "learning_rate": 2.399092256119904e-06, "loss": 0.0603, "step": 1053 }, { "epoch": 1.34955185659411, "grad_norm": 0.19965254670083077, "learning_rate": 2.390508869642806e-06, "loss": 0.0576, "step": 1054 }, { "epoch": 1.3508322663252241, "grad_norm": 0.21712472667872032, "learning_rate": 2.381936039019466e-06, "loss": 0.0786, "step": 1055 }, { "epoch": 1.352112676056338, "grad_norm": 0.21552734921720448, "learning_rate": 2.373373798928507e-06, "loss": 0.0741, "step": 1056 }, { "epoch": 1.3533930857874519, "grad_norm": 0.20538057728527542, "learning_rate": 2.364822184005703e-06, "loss": 0.0608, "step": 1057 }, { "epoch": 1.354673495518566, "grad_norm": 0.18046000723143035, "learning_rate": 2.356281228843852e-06, "loss": 0.0504, "step": 1058 }, { "epoch": 1.3559539052496798, "grad_norm": 0.21800838300979852, "learning_rate": 2.347750967992632e-06, "loss": 0.0881, "step": 1059 }, { "epoch": 1.357234314980794, "grad_norm": 0.19312621624891768, "learning_rate": 2.3392314359584552e-06, "loss": 0.0565, "step": 1060 }, { "epoch": 1.3585147247119078, "grad_norm": 0.1870768498888908, "learning_rate": 2.3307226672043413e-06, "loss": 0.0521, "step": 1061 }, { "epoch": 1.3597951344430217, "grad_norm": 0.2033470241469759, "learning_rate": 2.3222246961497658e-06, "loss": 0.0734, "step": 1062 }, { "epoch": 1.3610755441741358, "grad_norm": 0.20073653854118614, "learning_rate": 2.3137375571705236e-06, "loss": 0.0651, "step": 1063 }, { "epoch": 1.3623559539052497, "grad_norm": 0.19896092881438215, "learning_rate": 2.305261284598599e-06, "loss": 0.0597, "step": 1064 }, { "epoch": 1.3636363636363638, "grad_norm": 0.1881553928388899, "learning_rate": 2.296795912722014e-06, "loss": 0.0509, "step": 1065 }, { "epoch": 1.3649167733674776, "grad_norm": 0.21350879123823702, "learning_rate": 2.288341475784695e-06, "loss": 0.0764, "step": 1066 }, { "epoch": 1.3661971830985915, "grad_norm": 0.204331622858107, "learning_rate": 2.2798980079863386e-06, "loss": 0.0714, "step": 1067 }, { "epoch": 1.3674775928297054, "grad_norm": 0.2044766457853502, "learning_rate": 2.271465543482269e-06, "loss": 0.0733, "step": 1068 }, { "epoch": 1.3687580025608195, "grad_norm": 0.21965982410969004, "learning_rate": 2.2630441163832957e-06, "loss": 0.0733, "step": 1069 }, { "epoch": 1.3700384122919333, "grad_norm": 0.20215533899899032, "learning_rate": 2.254633760755585e-06, "loss": 0.0678, "step": 1070 }, { "epoch": 1.3713188220230474, "grad_norm": 0.2054466217305257, "learning_rate": 2.2462345106205124e-06, "loss": 0.0683, "step": 1071 }, { "epoch": 1.3725992317541613, "grad_norm": 0.19724022069164437, "learning_rate": 2.2378463999545353e-06, "loss": 0.062, "step": 1072 }, { "epoch": 1.3738796414852752, "grad_norm": 0.2156286162059053, "learning_rate": 2.229469462689045e-06, "loss": 0.0723, "step": 1073 }, { "epoch": 1.3751600512163893, "grad_norm": 0.2066441343712975, "learning_rate": 2.221103732710235e-06, "loss": 0.0741, "step": 1074 }, { "epoch": 1.3764404609475032, "grad_norm": 0.203281348300415, "learning_rate": 2.2127492438589677e-06, "loss": 0.0686, "step": 1075 }, { "epoch": 1.3777208706786173, "grad_norm": 0.1926530983142129, "learning_rate": 2.204406029930627e-06, "loss": 0.0542, "step": 1076 }, { "epoch": 1.3790012804097311, "grad_norm": 0.19071223840152413, "learning_rate": 2.196074124674994e-06, "loss": 0.06, "step": 1077 }, { "epoch": 1.380281690140845, "grad_norm": 0.21057562501520644, "learning_rate": 2.187753561796097e-06, "loss": 0.0757, "step": 1078 }, { "epoch": 1.381562099871959, "grad_norm": 0.20857856263521446, "learning_rate": 2.179444374952089e-06, "loss": 0.0728, "step": 1079 }, { "epoch": 1.382842509603073, "grad_norm": 0.195437719211652, "learning_rate": 2.171146597755104e-06, "loss": 0.0598, "step": 1080 }, { "epoch": 1.384122919334187, "grad_norm": 0.19566391975670985, "learning_rate": 2.162860263771118e-06, "loss": 0.0607, "step": 1081 }, { "epoch": 1.385403329065301, "grad_norm": 0.2017761269350869, "learning_rate": 2.1545854065198185e-06, "loss": 0.0728, "step": 1082 }, { "epoch": 1.3866837387964148, "grad_norm": 0.1941103437233159, "learning_rate": 2.146322059474471e-06, "loss": 0.059, "step": 1083 }, { "epoch": 1.3879641485275287, "grad_norm": 0.22287852401728198, "learning_rate": 2.1380702560617754e-06, "loss": 0.057, "step": 1084 }, { "epoch": 1.3892445582586428, "grad_norm": 0.23319868505583136, "learning_rate": 2.129830029661741e-06, "loss": 0.0608, "step": 1085 }, { "epoch": 1.3905249679897567, "grad_norm": 0.20092456767311398, "learning_rate": 2.1216014136075404e-06, "loss": 0.0611, "step": 1086 }, { "epoch": 1.3918053777208708, "grad_norm": 0.19897370637445758, "learning_rate": 2.1133844411853814e-06, "loss": 0.0634, "step": 1087 }, { "epoch": 1.3930857874519846, "grad_norm": 0.20739404858456598, "learning_rate": 2.105179145634377e-06, "loss": 0.0633, "step": 1088 }, { "epoch": 1.3943661971830985, "grad_norm": 0.20875093375942824, "learning_rate": 2.0969855601463966e-06, "loss": 0.0568, "step": 1089 }, { "epoch": 1.3956466069142126, "grad_norm": 0.1934014271741433, "learning_rate": 2.0888037178659472e-06, "loss": 0.0667, "step": 1090 }, { "epoch": 1.3969270166453265, "grad_norm": 0.19711939309450682, "learning_rate": 2.08063365189003e-06, "loss": 0.0652, "step": 1091 }, { "epoch": 1.3982074263764406, "grad_norm": 0.20862615972374418, "learning_rate": 2.0724753952680083e-06, "loss": 0.0708, "step": 1092 }, { "epoch": 1.3994878361075545, "grad_norm": 0.19619045067066884, "learning_rate": 2.064328981001473e-06, "loss": 0.0655, "step": 1093 }, { "epoch": 1.4007682458386683, "grad_norm": 0.1944261648713056, "learning_rate": 2.0561944420441154e-06, "loss": 0.0546, "step": 1094 }, { "epoch": 1.4020486555697822, "grad_norm": 0.1932080370427703, "learning_rate": 2.0480718113015834e-06, "loss": 0.0619, "step": 1095 }, { "epoch": 1.4033290653008963, "grad_norm": 0.20792951167906953, "learning_rate": 2.0399611216313604e-06, "loss": 0.0732, "step": 1096 }, { "epoch": 1.4046094750320102, "grad_norm": 0.1968440443560805, "learning_rate": 2.03186240584262e-06, "loss": 0.0648, "step": 1097 }, { "epoch": 1.4058898847631243, "grad_norm": 0.21474596287946537, "learning_rate": 2.023775696696101e-06, "loss": 0.0781, "step": 1098 }, { "epoch": 1.4071702944942381, "grad_norm": 0.2168190475752317, "learning_rate": 2.015701026903975e-06, "loss": 0.0761, "step": 1099 }, { "epoch": 1.408450704225352, "grad_norm": 0.21611834343982658, "learning_rate": 2.0076384291297134e-06, "loss": 0.0828, "step": 1100 }, { "epoch": 1.409731113956466, "grad_norm": 0.22766536725252917, "learning_rate": 1.999587935987949e-06, "loss": 0.0954, "step": 1101 }, { "epoch": 1.41101152368758, "grad_norm": 0.19266508726514534, "learning_rate": 1.991549580044355e-06, "loss": 0.0596, "step": 1102 }, { "epoch": 1.412291933418694, "grad_norm": 0.21630576282290262, "learning_rate": 1.9835233938155023e-06, "loss": 0.0669, "step": 1103 }, { "epoch": 1.413572343149808, "grad_norm": 0.20174611073748253, "learning_rate": 1.9755094097687384e-06, "loss": 0.0668, "step": 1104 }, { "epoch": 1.4148527528809218, "grad_norm": 0.20010657619622432, "learning_rate": 1.9675076603220462e-06, "loss": 0.0647, "step": 1105 }, { "epoch": 1.416133162612036, "grad_norm": 0.19288244757303488, "learning_rate": 1.959518177843918e-06, "loss": 0.0598, "step": 1106 }, { "epoch": 1.4174135723431498, "grad_norm": 0.20588594532173718, "learning_rate": 1.9515409946532277e-06, "loss": 0.0625, "step": 1107 }, { "epoch": 1.418693982074264, "grad_norm": 0.20146791447946594, "learning_rate": 1.943576143019094e-06, "loss": 0.0702, "step": 1108 }, { "epoch": 1.4199743918053778, "grad_norm": 0.20636437068819466, "learning_rate": 1.93562365516075e-06, "loss": 0.062, "step": 1109 }, { "epoch": 1.4212548015364916, "grad_norm": 0.2139563521711762, "learning_rate": 1.9276835632474183e-06, "loss": 0.077, "step": 1110 }, { "epoch": 1.4225352112676055, "grad_norm": 0.191821847104388, "learning_rate": 1.9197558993981784e-06, "loss": 0.054, "step": 1111 }, { "epoch": 1.4238156209987196, "grad_norm": 0.2078059078160054, "learning_rate": 1.9118406956818352e-06, "loss": 0.0727, "step": 1112 }, { "epoch": 1.4250960307298335, "grad_norm": 0.1905414879633392, "learning_rate": 1.9039379841167877e-06, "loss": 0.0545, "step": 1113 }, { "epoch": 1.4263764404609476, "grad_norm": 0.19166606302181505, "learning_rate": 1.896047796670903e-06, "loss": 0.0602, "step": 1114 }, { "epoch": 1.4276568501920615, "grad_norm": 0.22398301089740988, "learning_rate": 1.8881701652613887e-06, "loss": 0.0955, "step": 1115 }, { "epoch": 1.4289372599231753, "grad_norm": 0.2229678516718712, "learning_rate": 1.8803051217546586e-06, "loss": 0.0798, "step": 1116 }, { "epoch": 1.4302176696542894, "grad_norm": 0.1911896401200946, "learning_rate": 1.8724526979662045e-06, "loss": 0.0545, "step": 1117 }, { "epoch": 1.4314980793854033, "grad_norm": 0.20616222555584113, "learning_rate": 1.8646129256604738e-06, "loss": 0.0684, "step": 1118 }, { "epoch": 1.4327784891165174, "grad_norm": 0.22109035198084684, "learning_rate": 1.856785836550732e-06, "loss": 0.0878, "step": 1119 }, { "epoch": 1.4340588988476313, "grad_norm": 0.21639328095903118, "learning_rate": 1.8489714622989408e-06, "loss": 0.0616, "step": 1120 }, { "epoch": 1.4353393085787451, "grad_norm": 0.18610005774442434, "learning_rate": 1.841169834515631e-06, "loss": 0.0483, "step": 1121 }, { "epoch": 1.436619718309859, "grad_norm": 0.20644273766870688, "learning_rate": 1.8333809847597644e-06, "loss": 0.0676, "step": 1122 }, { "epoch": 1.4379001280409731, "grad_norm": 0.17944505367446414, "learning_rate": 1.825604944538622e-06, "loss": 0.0557, "step": 1123 }, { "epoch": 1.439180537772087, "grad_norm": 0.19824928829292426, "learning_rate": 1.8178417453076607e-06, "loss": 0.0557, "step": 1124 }, { "epoch": 1.440460947503201, "grad_norm": 0.21023697388500737, "learning_rate": 1.8100914184703956e-06, "loss": 0.0721, "step": 1125 }, { "epoch": 1.441741357234315, "grad_norm": 0.21389575811739556, "learning_rate": 1.8023539953782737e-06, "loss": 0.0796, "step": 1126 }, { "epoch": 1.4430217669654288, "grad_norm": 0.2316159023040796, "learning_rate": 1.7946295073305408e-06, "loss": 0.0688, "step": 1127 }, { "epoch": 1.444302176696543, "grad_norm": 0.19230672627931397, "learning_rate": 1.786917985574117e-06, "loss": 0.0562, "step": 1128 }, { "epoch": 1.4455825864276568, "grad_norm": 0.19468668400510344, "learning_rate": 1.7792194613034775e-06, "loss": 0.059, "step": 1129 }, { "epoch": 1.446862996158771, "grad_norm": 0.20557527007383872, "learning_rate": 1.7715339656605118e-06, "loss": 0.0643, "step": 1130 }, { "epoch": 1.4481434058898848, "grad_norm": 0.1928178976665305, "learning_rate": 1.7638615297344143e-06, "loss": 0.0596, "step": 1131 }, { "epoch": 1.4494238156209986, "grad_norm": 0.19873487332585918, "learning_rate": 1.7562021845615467e-06, "loss": 0.0657, "step": 1132 }, { "epoch": 1.4507042253521127, "grad_norm": 0.21745731851214492, "learning_rate": 1.748555961125315e-06, "loss": 0.0764, "step": 1133 }, { "epoch": 1.4519846350832266, "grad_norm": 0.2080960356027238, "learning_rate": 1.7409228903560498e-06, "loss": 0.0806, "step": 1134 }, { "epoch": 1.4532650448143407, "grad_norm": 0.22944878390080764, "learning_rate": 1.733303003130873e-06, "loss": 0.0679, "step": 1135 }, { "epoch": 1.4545454545454546, "grad_norm": 0.21369211565994412, "learning_rate": 1.7256963302735752e-06, "loss": 0.0703, "step": 1136 }, { "epoch": 1.4558258642765685, "grad_norm": 0.19033583010484845, "learning_rate": 1.7181029025544994e-06, "loss": 0.0575, "step": 1137 }, { "epoch": 1.4571062740076823, "grad_norm": 0.20555108660216753, "learning_rate": 1.7105227506904014e-06, "loss": 0.0585, "step": 1138 }, { "epoch": 1.4583866837387964, "grad_norm": 0.19308563927096448, "learning_rate": 1.7029559053443429e-06, "loss": 0.0608, "step": 1139 }, { "epoch": 1.4596670934699103, "grad_norm": 0.20154230585484242, "learning_rate": 1.6954023971255485e-06, "loss": 0.0692, "step": 1140 }, { "epoch": 1.4609475032010244, "grad_norm": 0.2124840403081628, "learning_rate": 1.6878622565892989e-06, "loss": 0.0687, "step": 1141 }, { "epoch": 1.4622279129321383, "grad_norm": 0.19357485619705922, "learning_rate": 1.6803355142367988e-06, "loss": 0.0652, "step": 1142 }, { "epoch": 1.4635083226632521, "grad_norm": 0.19530415381302244, "learning_rate": 1.672822200515054e-06, "loss": 0.0608, "step": 1143 }, { "epoch": 1.4647887323943662, "grad_norm": 0.21392038900544577, "learning_rate": 1.665322345816746e-06, "loss": 0.0712, "step": 1144 }, { "epoch": 1.4660691421254801, "grad_norm": 0.20187656435940052, "learning_rate": 1.6578359804801191e-06, "loss": 0.0673, "step": 1145 }, { "epoch": 1.4673495518565942, "grad_norm": 0.21316933147695688, "learning_rate": 1.650363134788844e-06, "loss": 0.0567, "step": 1146 }, { "epoch": 1.468629961587708, "grad_norm": 0.2028896407321813, "learning_rate": 1.6429038389719075e-06, "loss": 0.0587, "step": 1147 }, { "epoch": 1.469910371318822, "grad_norm": 0.19462151095432337, "learning_rate": 1.6354581232034811e-06, "loss": 0.0609, "step": 1148 }, { "epoch": 1.471190781049936, "grad_norm": 0.2064956849932238, "learning_rate": 1.6280260176028017e-06, "loss": 0.0666, "step": 1149 }, { "epoch": 1.47247119078105, "grad_norm": 0.21253123910046567, "learning_rate": 1.6206075522340565e-06, "loss": 0.0594, "step": 1150 }, { "epoch": 1.473751600512164, "grad_norm": 0.21434855786423096, "learning_rate": 1.6132027571062476e-06, "loss": 0.0781, "step": 1151 }, { "epoch": 1.475032010243278, "grad_norm": 0.23061193708509228, "learning_rate": 1.6058116621730851e-06, "loss": 0.0884, "step": 1152 }, { "epoch": 1.4763124199743918, "grad_norm": 0.2032449076437459, "learning_rate": 1.5984342973328581e-06, "loss": 0.0611, "step": 1153 }, { "epoch": 1.4775928297055057, "grad_norm": 0.21851209280783465, "learning_rate": 1.591070692428311e-06, "loss": 0.0609, "step": 1154 }, { "epoch": 1.4788732394366197, "grad_norm": 0.18063758488642642, "learning_rate": 1.583720877246533e-06, "loss": 0.0522, "step": 1155 }, { "epoch": 1.4801536491677336, "grad_norm": 0.30213673299017474, "learning_rate": 1.576384881518826e-06, "loss": 0.0844, "step": 1156 }, { "epoch": 1.4814340588988477, "grad_norm": 0.2086033122167656, "learning_rate": 1.5690627349205906e-06, "loss": 0.0781, "step": 1157 }, { "epoch": 1.4827144686299616, "grad_norm": 0.21504873382422657, "learning_rate": 1.5617544670712098e-06, "loss": 0.0762, "step": 1158 }, { "epoch": 1.4839948783610755, "grad_norm": 0.22059662092489835, "learning_rate": 1.55446010753392e-06, "loss": 0.0834, "step": 1159 }, { "epoch": 1.4852752880921896, "grad_norm": 0.20559827796663344, "learning_rate": 1.5471796858156951e-06, "loss": 0.0675, "step": 1160 }, { "epoch": 1.4865556978233034, "grad_norm": 0.21240051694897266, "learning_rate": 1.5399132313671328e-06, "loss": 0.0722, "step": 1161 }, { "epoch": 1.4878361075544175, "grad_norm": 0.18508727590635868, "learning_rate": 1.5326607735823284e-06, "loss": 0.0555, "step": 1162 }, { "epoch": 1.4891165172855314, "grad_norm": 0.19891751423921492, "learning_rate": 1.5254223417987568e-06, "loss": 0.0664, "step": 1163 }, { "epoch": 1.4903969270166453, "grad_norm": 0.20359227206167926, "learning_rate": 1.5181979652971579e-06, "loss": 0.0676, "step": 1164 }, { "epoch": 1.4916773367477592, "grad_norm": 0.2060213715124287, "learning_rate": 1.5109876733014123e-06, "loss": 0.063, "step": 1165 }, { "epoch": 1.4929577464788732, "grad_norm": 0.19767456296323482, "learning_rate": 1.50379149497843e-06, "loss": 0.0606, "step": 1166 }, { "epoch": 1.4942381562099871, "grad_norm": 0.19138596252119403, "learning_rate": 1.4966094594380258e-06, "loss": 0.0521, "step": 1167 }, { "epoch": 1.4955185659411012, "grad_norm": 0.20796055923731488, "learning_rate": 1.489441595732804e-06, "loss": 0.0664, "step": 1168 }, { "epoch": 1.496798975672215, "grad_norm": 0.2085574195409086, "learning_rate": 1.4822879328580453e-06, "loss": 0.07, "step": 1169 }, { "epoch": 1.498079385403329, "grad_norm": 0.20147140042309697, "learning_rate": 1.475148499751582e-06, "loss": 0.0692, "step": 1170 }, { "epoch": 1.499359795134443, "grad_norm": 0.19521442530197616, "learning_rate": 1.4680233252936832e-06, "loss": 0.0621, "step": 1171 }, { "epoch": 1.500640204865557, "grad_norm": 0.20012232964788193, "learning_rate": 1.4609124383069434e-06, "loss": 0.0705, "step": 1172 }, { "epoch": 1.501920614596671, "grad_norm": 0.2227548170744679, "learning_rate": 1.4538158675561597e-06, "loss": 0.0808, "step": 1173 }, { "epoch": 1.503201024327785, "grad_norm": 0.19447445370507122, "learning_rate": 1.4467336417482198e-06, "loss": 0.0469, "step": 1174 }, { "epoch": 1.5044814340588988, "grad_norm": 0.200433779191173, "learning_rate": 1.4396657895319782e-06, "loss": 0.0585, "step": 1175 }, { "epoch": 1.5057618437900127, "grad_norm": 0.21362410880580374, "learning_rate": 1.4326123394981484e-06, "loss": 0.0786, "step": 1176 }, { "epoch": 1.5070422535211268, "grad_norm": 0.2122735225638961, "learning_rate": 1.4255733201791883e-06, "loss": 0.0708, "step": 1177 }, { "epoch": 1.5083226632522408, "grad_norm": 0.22209267690116122, "learning_rate": 1.4185487600491755e-06, "loss": 0.0828, "step": 1178 }, { "epoch": 1.5096030729833547, "grad_norm": 0.18237290234580011, "learning_rate": 1.411538687523698e-06, "loss": 0.0535, "step": 1179 }, { "epoch": 1.5108834827144686, "grad_norm": 0.20519085933241782, "learning_rate": 1.4045431309597447e-06, "loss": 0.0814, "step": 1180 }, { "epoch": 1.5121638924455825, "grad_norm": 0.2149799116874972, "learning_rate": 1.3975621186555766e-06, "loss": 0.0755, "step": 1181 }, { "epoch": 1.5134443021766966, "grad_norm": 0.20570639601962545, "learning_rate": 1.3905956788506287e-06, "loss": 0.0635, "step": 1182 }, { "epoch": 1.5147247119078107, "grad_norm": 0.19154639514247507, "learning_rate": 1.3836438397253805e-06, "loss": 0.0602, "step": 1183 }, { "epoch": 1.5160051216389245, "grad_norm": 0.19089793829107424, "learning_rate": 1.3767066294012532e-06, "loss": 0.0554, "step": 1184 }, { "epoch": 1.5172855313700384, "grad_norm": 0.19725904149301007, "learning_rate": 1.369784075940494e-06, "loss": 0.0568, "step": 1185 }, { "epoch": 1.5185659411011523, "grad_norm": 0.21734724214079665, "learning_rate": 1.362876207346055e-06, "loss": 0.0749, "step": 1186 }, { "epoch": 1.5198463508322664, "grad_norm": 0.20348966370723365, "learning_rate": 1.3559830515614874e-06, "loss": 0.0705, "step": 1187 }, { "epoch": 1.5211267605633803, "grad_norm": 0.18819348353754645, "learning_rate": 1.3491046364708294e-06, "loss": 0.0593, "step": 1188 }, { "epoch": 1.5224071702944943, "grad_norm": 0.1980918684988977, "learning_rate": 1.3422409898984872e-06, "loss": 0.0581, "step": 1189 }, { "epoch": 1.5236875800256082, "grad_norm": 0.2196196088720019, "learning_rate": 1.3353921396091252e-06, "loss": 0.0792, "step": 1190 }, { "epoch": 1.524967989756722, "grad_norm": 0.1937948057698019, "learning_rate": 1.3285581133075582e-06, "loss": 0.0577, "step": 1191 }, { "epoch": 1.526248399487836, "grad_norm": 0.19932069535502206, "learning_rate": 1.3217389386386304e-06, "loss": 0.0649, "step": 1192 }, { "epoch": 1.52752880921895, "grad_norm": 0.20172482425870408, "learning_rate": 1.3149346431871118e-06, "loss": 0.0584, "step": 1193 }, { "epoch": 1.5288092189500642, "grad_norm": 0.3375700749509525, "learning_rate": 1.3081452544775852e-06, "loss": 0.0788, "step": 1194 }, { "epoch": 1.530089628681178, "grad_norm": 0.20768264895403132, "learning_rate": 1.301370799974327e-06, "loss": 0.0634, "step": 1195 }, { "epoch": 1.531370038412292, "grad_norm": 0.20614117723993183, "learning_rate": 1.2946113070812073e-06, "loss": 0.0706, "step": 1196 }, { "epoch": 1.5326504481434058, "grad_norm": 0.193154828083668, "learning_rate": 1.287866803141572e-06, "loss": 0.0579, "step": 1197 }, { "epoch": 1.5339308578745199, "grad_norm": 0.2236138497794174, "learning_rate": 1.2811373154381312e-06, "loss": 0.077, "step": 1198 }, { "epoch": 1.5352112676056338, "grad_norm": 0.20899308806866881, "learning_rate": 1.2744228711928585e-06, "loss": 0.0555, "step": 1199 }, { "epoch": 1.5364916773367479, "grad_norm": 0.20188979991160919, "learning_rate": 1.2677234975668662e-06, "loss": 0.0675, "step": 1200 }, { "epoch": 1.5364916773367479, "eval_loss": 0.08611313253641129, "eval_runtime": 10.9235, "eval_samples_per_second": 23.161, "eval_steps_per_second": 5.859, "step": 1200 }, { "epoch": 1.5377720870678617, "grad_norm": 0.20042871440553123, "learning_rate": 1.2610392216603106e-06, "loss": 0.0642, "step": 1201 }, { "epoch": 1.5390524967989756, "grad_norm": 0.19488554500162558, "learning_rate": 1.2543700705122697e-06, "loss": 0.0677, "step": 1202 }, { "epoch": 1.5403329065300895, "grad_norm": 0.20783053650260008, "learning_rate": 1.2477160711006397e-06, "loss": 0.0688, "step": 1203 }, { "epoch": 1.5416133162612036, "grad_norm": 0.20698170162955803, "learning_rate": 1.2410772503420276e-06, "loss": 0.0731, "step": 1204 }, { "epoch": 1.5428937259923177, "grad_norm": 0.20321172587614342, "learning_rate": 1.2344536350916414e-06, "loss": 0.0707, "step": 1205 }, { "epoch": 1.5441741357234315, "grad_norm": 0.18986854934698014, "learning_rate": 1.2278452521431744e-06, "loss": 0.0509, "step": 1206 }, { "epoch": 1.5454545454545454, "grad_norm": 0.20341705087761094, "learning_rate": 1.2212521282287093e-06, "loss": 0.0687, "step": 1207 }, { "epoch": 1.5467349551856593, "grad_norm": 0.1982377029607459, "learning_rate": 1.2146742900185965e-06, "loss": 0.0553, "step": 1208 }, { "epoch": 1.5480153649167734, "grad_norm": 0.41670153578985175, "learning_rate": 1.208111764121359e-06, "loss": 0.0665, "step": 1209 }, { "epoch": 1.5492957746478875, "grad_norm": 0.22526000382589131, "learning_rate": 1.2015645770835765e-06, "loss": 0.0934, "step": 1210 }, { "epoch": 1.5505761843790014, "grad_norm": 0.1997902645377399, "learning_rate": 1.1950327553897767e-06, "loss": 0.0623, "step": 1211 }, { "epoch": 1.5518565941101152, "grad_norm": 0.1921870617685992, "learning_rate": 1.1885163254623393e-06, "loss": 0.0582, "step": 1212 }, { "epoch": 1.553137003841229, "grad_norm": 0.20908352513641384, "learning_rate": 1.1820153136613744e-06, "loss": 0.0768, "step": 1213 }, { "epoch": 1.5544174135723432, "grad_norm": 0.2104635241831041, "learning_rate": 1.1755297462846265e-06, "loss": 0.0624, "step": 1214 }, { "epoch": 1.555697823303457, "grad_norm": 0.21328033080149544, "learning_rate": 1.1690596495673672e-06, "loss": 0.0788, "step": 1215 }, { "epoch": 1.5569782330345712, "grad_norm": 0.2083041842694529, "learning_rate": 1.1626050496822794e-06, "loss": 0.0657, "step": 1216 }, { "epoch": 1.558258642765685, "grad_norm": 0.21288973478148915, "learning_rate": 1.156165972739366e-06, "loss": 0.0666, "step": 1217 }, { "epoch": 1.559539052496799, "grad_norm": 0.2065327023903655, "learning_rate": 1.1497424447858325e-06, "loss": 0.0739, "step": 1218 }, { "epoch": 1.5608194622279128, "grad_norm": 0.1994973821833753, "learning_rate": 1.1433344918059853e-06, "loss": 0.0636, "step": 1219 }, { "epoch": 1.5620998719590269, "grad_norm": 0.21333207081961592, "learning_rate": 1.1369421397211316e-06, "loss": 0.0771, "step": 1220 }, { "epoch": 1.563380281690141, "grad_norm": 0.1987265435065017, "learning_rate": 1.1305654143894674e-06, "loss": 0.0704, "step": 1221 }, { "epoch": 1.5646606914212549, "grad_norm": 0.215720295750172, "learning_rate": 1.1242043416059735e-06, "loss": 0.0648, "step": 1222 }, { "epoch": 1.5659411011523687, "grad_norm": 0.2081419233362204, "learning_rate": 1.1178589471023182e-06, "loss": 0.0635, "step": 1223 }, { "epoch": 1.5672215108834826, "grad_norm": 0.22571625396451214, "learning_rate": 1.111529256546745e-06, "loss": 0.0796, "step": 1224 }, { "epoch": 1.5685019206145967, "grad_norm": 0.21069234451640503, "learning_rate": 1.1052152955439732e-06, "loss": 0.0689, "step": 1225 }, { "epoch": 1.5697823303457106, "grad_norm": 0.21739573489227607, "learning_rate": 1.0989170896350947e-06, "loss": 0.0748, "step": 1226 }, { "epoch": 1.5710627400768247, "grad_norm": 0.20625126760996174, "learning_rate": 1.0926346642974656e-06, "loss": 0.075, "step": 1227 }, { "epoch": 1.5723431498079385, "grad_norm": 0.19222924738931604, "learning_rate": 1.0863680449446111e-06, "loss": 0.0586, "step": 1228 }, { "epoch": 1.5736235595390524, "grad_norm": 0.2026991433069211, "learning_rate": 1.080117256926116e-06, "loss": 0.0616, "step": 1229 }, { "epoch": 1.5749039692701663, "grad_norm": 0.2764326379765515, "learning_rate": 1.0738823255275227e-06, "loss": 0.0881, "step": 1230 }, { "epoch": 1.5761843790012804, "grad_norm": 0.21631633807858094, "learning_rate": 1.0676632759702354e-06, "loss": 0.0924, "step": 1231 }, { "epoch": 1.5774647887323945, "grad_norm": 0.21463917716113898, "learning_rate": 1.0614601334114099e-06, "loss": 0.069, "step": 1232 }, { "epoch": 1.5787451984635084, "grad_norm": 0.20666863327609916, "learning_rate": 1.0552729229438553e-06, "loss": 0.0744, "step": 1233 }, { "epoch": 1.5800256081946222, "grad_norm": 0.19108607455468793, "learning_rate": 1.049101669595935e-06, "loss": 0.0549, "step": 1234 }, { "epoch": 1.581306017925736, "grad_norm": 0.20556832867384014, "learning_rate": 1.042946398331461e-06, "loss": 0.063, "step": 1235 }, { "epoch": 1.5825864276568502, "grad_norm": 0.19557065221025682, "learning_rate": 1.0368071340495978e-06, "loss": 0.0645, "step": 1236 }, { "epoch": 1.5838668373879643, "grad_norm": 0.20816613340890566, "learning_rate": 1.0306839015847552e-06, "loss": 0.0712, "step": 1237 }, { "epoch": 1.5851472471190782, "grad_norm": 0.20409575978544475, "learning_rate": 1.0245767257064914e-06, "loss": 0.0622, "step": 1238 }, { "epoch": 1.586427656850192, "grad_norm": 0.1876429861409003, "learning_rate": 1.018485631119417e-06, "loss": 0.0559, "step": 1239 }, { "epoch": 1.587708066581306, "grad_norm": 0.211106775693684, "learning_rate": 1.012410642463087e-06, "loss": 0.074, "step": 1240 }, { "epoch": 1.58898847631242, "grad_norm": 0.2140586605265203, "learning_rate": 1.006351784311906e-06, "loss": 0.0728, "step": 1241 }, { "epoch": 1.590268886043534, "grad_norm": 0.20031418319365893, "learning_rate": 1.0003090811750294e-06, "loss": 0.0688, "step": 1242 }, { "epoch": 1.591549295774648, "grad_norm": 0.20813906675090527, "learning_rate": 9.942825574962595e-07, "loss": 0.0633, "step": 1243 }, { "epoch": 1.5928297055057619, "grad_norm": 0.19103123134818825, "learning_rate": 9.882722376539549e-07, "loss": 0.0595, "step": 1244 }, { "epoch": 1.5941101152368757, "grad_norm": 0.20762738384116486, "learning_rate": 9.822781459609209e-07, "loss": 0.056, "step": 1245 }, { "epoch": 1.5953905249679896, "grad_norm": 0.19736873678100877, "learning_rate": 9.76300306664321e-07, "loss": 0.0653, "step": 1246 }, { "epoch": 1.5966709346991037, "grad_norm": 0.19522105843965765, "learning_rate": 9.703387439455758e-07, "loss": 0.068, "step": 1247 }, { "epoch": 1.5979513444302178, "grad_norm": 0.2016181819439628, "learning_rate": 9.643934819202604e-07, "loss": 0.0637, "step": 1248 }, { "epoch": 1.5992317541613317, "grad_norm": 0.187792718163661, "learning_rate": 9.584645446380114e-07, "loss": 0.0617, "step": 1249 }, { "epoch": 1.6005121638924455, "grad_norm": 0.2054432465847264, "learning_rate": 9.525519560824326e-07, "loss": 0.0733, "step": 1250 }, { "epoch": 1.6017925736235594, "grad_norm": 0.21350113646375155, "learning_rate": 9.466557401709892e-07, "loss": 0.0784, "step": 1251 }, { "epoch": 1.6030729833546735, "grad_norm": 0.2099857149278394, "learning_rate": 9.407759207549217e-07, "loss": 0.076, "step": 1252 }, { "epoch": 1.6043533930857876, "grad_norm": 0.1913521115674782, "learning_rate": 9.349125216191396e-07, "loss": 0.0561, "step": 1253 }, { "epoch": 1.6056338028169015, "grad_norm": 0.19297019970472332, "learning_rate": 9.290655664821296e-07, "loss": 0.0547, "step": 1254 }, { "epoch": 1.6069142125480154, "grad_norm": 0.19220976280460547, "learning_rate": 9.232350789958616e-07, "loss": 0.0586, "step": 1255 }, { "epoch": 1.6081946222791292, "grad_norm": 0.20455091308186316, "learning_rate": 9.174210827456914e-07, "loss": 0.0719, "step": 1256 }, { "epoch": 1.6094750320102431, "grad_norm": 0.20460767010770822, "learning_rate": 9.11623601250261e-07, "loss": 0.0669, "step": 1257 }, { "epoch": 1.6107554417413572, "grad_norm": 0.20561717851585962, "learning_rate": 9.058426579614127e-07, "loss": 0.0665, "step": 1258 }, { "epoch": 1.6120358514724713, "grad_norm": 0.1932394039205558, "learning_rate": 9.000782762640842e-07, "loss": 0.0596, "step": 1259 }, { "epoch": 1.6133162612035852, "grad_norm": 0.20248938320932783, "learning_rate": 8.943304794762192e-07, "loss": 0.064, "step": 1260 }, { "epoch": 1.614596670934699, "grad_norm": 0.1758371919921657, "learning_rate": 8.885992908486762e-07, "loss": 0.0452, "step": 1261 }, { "epoch": 1.615877080665813, "grad_norm": 0.21962459681935562, "learning_rate": 8.828847335651258e-07, "loss": 0.0762, "step": 1262 }, { "epoch": 1.617157490396927, "grad_norm": 0.21435850606339485, "learning_rate": 8.771868307419668e-07, "loss": 0.0757, "step": 1263 }, { "epoch": 1.6184379001280411, "grad_norm": 0.20605343265604814, "learning_rate": 8.715056054282234e-07, "loss": 0.0647, "step": 1264 }, { "epoch": 1.619718309859155, "grad_norm": 0.18940832109878186, "learning_rate": 8.658410806054568e-07, "loss": 0.0594, "step": 1265 }, { "epoch": 1.6209987195902689, "grad_norm": 0.1911889279048385, "learning_rate": 8.601932791876755e-07, "loss": 0.0596, "step": 1266 }, { "epoch": 1.6222791293213827, "grad_norm": 0.2107569616024212, "learning_rate": 8.54562224021237e-07, "loss": 0.0815, "step": 1267 }, { "epoch": 1.6235595390524968, "grad_norm": 0.20138591415600265, "learning_rate": 8.489479378847537e-07, "loss": 0.0667, "step": 1268 }, { "epoch": 1.6248399487836107, "grad_norm": 0.22420070410250878, "learning_rate": 8.433504434890105e-07, "loss": 0.0815, "step": 1269 }, { "epoch": 1.6261203585147248, "grad_norm": 0.1912107765302422, "learning_rate": 8.377697634768611e-07, "loss": 0.0622, "step": 1270 }, { "epoch": 1.6274007682458387, "grad_norm": 0.20097014870433733, "learning_rate": 8.322059204231464e-07, "loss": 0.0625, "step": 1271 }, { "epoch": 1.6286811779769526, "grad_norm": 0.20743703185530196, "learning_rate": 8.266589368345956e-07, "loss": 0.0721, "step": 1272 }, { "epoch": 1.6299615877080664, "grad_norm": 0.19757943193828886, "learning_rate": 8.211288351497398e-07, "loss": 0.0603, "step": 1273 }, { "epoch": 1.6312419974391805, "grad_norm": 0.19800780226016565, "learning_rate": 8.156156377388202e-07, "loss": 0.0616, "step": 1274 }, { "epoch": 1.6325224071702946, "grad_norm": 0.20370802796706292, "learning_rate": 8.101193669036961e-07, "loss": 0.0607, "step": 1275 }, { "epoch": 1.6338028169014085, "grad_norm": 0.21678643362521716, "learning_rate": 8.046400448777575e-07, "loss": 0.0798, "step": 1276 }, { "epoch": 1.6350832266325224, "grad_norm": 0.19715683954123955, "learning_rate": 7.991776938258305e-07, "loss": 0.0566, "step": 1277 }, { "epoch": 1.6363636363636362, "grad_norm": 0.2046696487671033, "learning_rate": 7.937323358440935e-07, "loss": 0.0735, "step": 1278 }, { "epoch": 1.6376440460947503, "grad_norm": 0.2188498994592189, "learning_rate": 7.883039929599857e-07, "loss": 0.0801, "step": 1279 }, { "epoch": 1.6389244558258644, "grad_norm": 0.19420927570752983, "learning_rate": 7.828926871321135e-07, "loss": 0.057, "step": 1280 }, { "epoch": 1.6402048655569783, "grad_norm": 0.22772540164942126, "learning_rate": 7.774984402501662e-07, "loss": 0.0843, "step": 1281 }, { "epoch": 1.6414852752880922, "grad_norm": 0.2003661001936283, "learning_rate": 7.721212741348305e-07, "loss": 0.0568, "step": 1282 }, { "epoch": 1.642765685019206, "grad_norm": 0.19321405783173748, "learning_rate": 7.667612105376937e-07, "loss": 0.0593, "step": 1283 }, { "epoch": 1.6440460947503202, "grad_norm": 0.20681530892425667, "learning_rate": 7.614182711411622e-07, "loss": 0.072, "step": 1284 }, { "epoch": 1.645326504481434, "grad_norm": 0.20076071704550877, "learning_rate": 7.560924775583739e-07, "loss": 0.0643, "step": 1285 }, { "epoch": 1.6466069142125481, "grad_norm": 0.18987488242194264, "learning_rate": 7.507838513331051e-07, "loss": 0.0577, "step": 1286 }, { "epoch": 1.647887323943662, "grad_norm": 0.19482028461561646, "learning_rate": 7.45492413939689e-07, "loss": 0.0535, "step": 1287 }, { "epoch": 1.6491677336747759, "grad_norm": 0.19882380782330508, "learning_rate": 7.402181867829294e-07, "loss": 0.0705, "step": 1288 }, { "epoch": 1.6504481434058897, "grad_norm": 0.20731804232464343, "learning_rate": 7.349611911980064e-07, "loss": 0.0648, "step": 1289 }, { "epoch": 1.6517285531370038, "grad_norm": 0.21991391241691002, "learning_rate": 7.297214484504006e-07, "loss": 0.0813, "step": 1290 }, { "epoch": 1.653008962868118, "grad_norm": 0.20030869826934491, "learning_rate": 7.244989797357982e-07, "loss": 0.0636, "step": 1291 }, { "epoch": 1.6542893725992318, "grad_norm": 0.20808543559803042, "learning_rate": 7.192938061800081e-07, "loss": 0.071, "step": 1292 }, { "epoch": 1.6555697823303457, "grad_norm": 0.18748165419320376, "learning_rate": 7.14105948838883e-07, "loss": 0.0532, "step": 1293 }, { "epoch": 1.6568501920614596, "grad_norm": 0.20469704413675108, "learning_rate": 7.089354286982219e-07, "loss": 0.0666, "step": 1294 }, { "epoch": 1.6581306017925737, "grad_norm": 0.20601126167551978, "learning_rate": 7.037822666736948e-07, "loss": 0.0701, "step": 1295 }, { "epoch": 1.6594110115236875, "grad_norm": 0.19743190825191756, "learning_rate": 6.986464836107548e-07, "loss": 0.0598, "step": 1296 }, { "epoch": 1.6606914212548016, "grad_norm": 0.20104828319487214, "learning_rate": 6.935281002845551e-07, "loss": 0.0614, "step": 1297 }, { "epoch": 1.6619718309859155, "grad_norm": 0.23711796771781415, "learning_rate": 6.884271373998608e-07, "loss": 0.0536, "step": 1298 }, { "epoch": 1.6632522407170294, "grad_norm": 0.1925756788419184, "learning_rate": 6.833436155909729e-07, "loss": 0.0608, "step": 1299 }, { "epoch": 1.6645326504481432, "grad_norm": 0.19819200908428286, "learning_rate": 6.782775554216337e-07, "loss": 0.0609, "step": 1300 }, { "epoch": 1.6658130601792573, "grad_norm": 0.19886503996795452, "learning_rate": 6.732289773849577e-07, "loss": 0.0619, "step": 1301 }, { "epoch": 1.6670934699103714, "grad_norm": 0.19950368361491885, "learning_rate": 6.681979019033346e-07, "loss": 0.0599, "step": 1302 }, { "epoch": 1.6683738796414853, "grad_norm": 0.18896145085502247, "learning_rate": 6.631843493283552e-07, "loss": 0.0493, "step": 1303 }, { "epoch": 1.6696542893725992, "grad_norm": 0.20721021612349017, "learning_rate": 6.581883399407302e-07, "loss": 0.0666, "step": 1304 }, { "epoch": 1.670934699103713, "grad_norm": 0.19182803570935616, "learning_rate": 6.532098939501996e-07, "loss": 0.057, "step": 1305 }, { "epoch": 1.6722151088348272, "grad_norm": 0.19673076628262065, "learning_rate": 6.48249031495462e-07, "loss": 0.0632, "step": 1306 }, { "epoch": 1.6734955185659413, "grad_norm": 0.2116210980970092, "learning_rate": 6.433057726440811e-07, "loss": 0.069, "step": 1307 }, { "epoch": 1.6747759282970551, "grad_norm": 0.1914436002827368, "learning_rate": 6.38380137392417e-07, "loss": 0.0546, "step": 1308 }, { "epoch": 1.676056338028169, "grad_norm": 0.20205759119968134, "learning_rate": 6.334721456655363e-07, "loss": 0.0695, "step": 1309 }, { "epoch": 1.6773367477592829, "grad_norm": 0.20040314949627408, "learning_rate": 6.285818173171349e-07, "loss": 0.0642, "step": 1310 }, { "epoch": 1.678617157490397, "grad_norm": 0.20087539803182428, "learning_rate": 6.237091721294547e-07, "loss": 0.0678, "step": 1311 }, { "epoch": 1.6798975672215108, "grad_norm": 0.19588423552870884, "learning_rate": 6.188542298132122e-07, "loss": 0.0651, "step": 1312 }, { "epoch": 1.681177976952625, "grad_norm": 0.18471061326578725, "learning_rate": 6.140170100075049e-07, "loss": 0.0526, "step": 1313 }, { "epoch": 1.6824583866837388, "grad_norm": 0.21437615834149437, "learning_rate": 6.091975322797472e-07, "loss": 0.0703, "step": 1314 }, { "epoch": 1.6837387964148527, "grad_norm": 0.19420277235500474, "learning_rate": 6.043958161255781e-07, "loss": 0.0563, "step": 1315 }, { "epoch": 1.6850192061459666, "grad_norm": 0.18855621090023875, "learning_rate": 5.996118809687895e-07, "loss": 0.0553, "step": 1316 }, { "epoch": 1.6862996158770807, "grad_norm": 0.193826879822222, "learning_rate": 5.948457461612478e-07, "loss": 0.0635, "step": 1317 }, { "epoch": 1.6875800256081948, "grad_norm": 0.20768744844990208, "learning_rate": 5.900974309828101e-07, "loss": 0.0706, "step": 1318 }, { "epoch": 1.6888604353393086, "grad_norm": 0.19999584366902196, "learning_rate": 5.853669546412538e-07, "loss": 0.0591, "step": 1319 }, { "epoch": 1.6901408450704225, "grad_norm": 0.18755720922101884, "learning_rate": 5.806543362721945e-07, "loss": 0.0514, "step": 1320 }, { "epoch": 1.6914212548015364, "grad_norm": 0.19613282726092443, "learning_rate": 5.759595949390063e-07, "loss": 0.0624, "step": 1321 }, { "epoch": 1.6927016645326505, "grad_norm": 0.19353306996926475, "learning_rate": 5.7128274963275e-07, "loss": 0.0603, "step": 1322 }, { "epoch": 1.6939820742637646, "grad_norm": 0.20351508876244323, "learning_rate": 5.666238192720941e-07, "loss": 0.0567, "step": 1323 }, { "epoch": 1.6952624839948784, "grad_norm": 0.2048569598683432, "learning_rate": 5.619828227032342e-07, "loss": 0.0579, "step": 1324 }, { "epoch": 1.6965428937259923, "grad_norm": 0.19350302731539581, "learning_rate": 5.573597786998264e-07, "loss": 0.0639, "step": 1325 }, { "epoch": 1.6978233034571062, "grad_norm": 0.20670903133608162, "learning_rate": 5.527547059629013e-07, "loss": 0.0679, "step": 1326 }, { "epoch": 1.69910371318822, "grad_norm": 0.2017429134775229, "learning_rate": 5.481676231207922e-07, "loss": 0.0663, "step": 1327 }, { "epoch": 1.7003841229193342, "grad_norm": 0.19458035334417656, "learning_rate": 5.43598548729063e-07, "loss": 0.0683, "step": 1328 }, { "epoch": 1.7016645326504483, "grad_norm": 0.19466757828603515, "learning_rate": 5.390475012704293e-07, "loss": 0.057, "step": 1329 }, { "epoch": 1.7029449423815621, "grad_norm": 0.20642784982580575, "learning_rate": 5.34514499154683e-07, "loss": 0.0672, "step": 1330 }, { "epoch": 1.704225352112676, "grad_norm": 0.2113973098827549, "learning_rate": 5.29999560718622e-07, "loss": 0.0719, "step": 1331 }, { "epoch": 1.7055057618437899, "grad_norm": 0.20302275997305902, "learning_rate": 5.255027042259692e-07, "loss": 0.0634, "step": 1332 }, { "epoch": 1.706786171574904, "grad_norm": 0.19648816412155917, "learning_rate": 5.210239478673085e-07, "loss": 0.0568, "step": 1333 }, { "epoch": 1.708066581306018, "grad_norm": 0.20163681430016664, "learning_rate": 5.165633097600004e-07, "loss": 0.0657, "step": 1334 }, { "epoch": 1.709346991037132, "grad_norm": 0.21327403540952428, "learning_rate": 5.121208079481166e-07, "loss": 0.0732, "step": 1335 }, { "epoch": 1.7106274007682458, "grad_norm": 0.1980561424451561, "learning_rate": 5.076964604023644e-07, "loss": 0.0586, "step": 1336 }, { "epoch": 1.7119078104993597, "grad_norm": 0.1999534932780532, "learning_rate": 5.032902850200122e-07, "loss": 0.067, "step": 1337 }, { "epoch": 1.7131882202304738, "grad_norm": 0.19929801193180138, "learning_rate": 4.989022996248194e-07, "loss": 0.067, "step": 1338 }, { "epoch": 1.7144686299615877, "grad_norm": 0.19523931547158183, "learning_rate": 4.94532521966965e-07, "loss": 0.0633, "step": 1339 }, { "epoch": 1.7157490396927018, "grad_norm": 0.21602586500213825, "learning_rate": 4.901809697229731e-07, "loss": 0.061, "step": 1340 }, { "epoch": 1.7170294494238156, "grad_norm": 0.19547419575092737, "learning_rate": 4.858476604956447e-07, "loss": 0.0646, "step": 1341 }, { "epoch": 1.7183098591549295, "grad_norm": 0.20740975374011206, "learning_rate": 4.815326118139813e-07, "loss": 0.0666, "step": 1342 }, { "epoch": 1.7195902688860434, "grad_norm": 0.20043931493714992, "learning_rate": 4.772358411331185e-07, "loss": 0.0617, "step": 1343 }, { "epoch": 1.7208706786171575, "grad_norm": 0.2250274560554516, "learning_rate": 4.7295736583425624e-07, "loss": 0.0733, "step": 1344 }, { "epoch": 1.7221510883482716, "grad_norm": 0.19949655278269196, "learning_rate": 4.6869720322458457e-07, "loss": 0.0636, "step": 1345 }, { "epoch": 1.7234314980793854, "grad_norm": 0.22384989767310934, "learning_rate": 4.6445537053721403e-07, "loss": 0.0672, "step": 1346 }, { "epoch": 1.7247119078104993, "grad_norm": 0.21843068193340365, "learning_rate": 4.602318849311116e-07, "loss": 0.0528, "step": 1347 }, { "epoch": 1.7259923175416132, "grad_norm": 0.2154837527596101, "learning_rate": 4.5602676349102203e-07, "loss": 0.0765, "step": 1348 }, { "epoch": 1.7272727272727273, "grad_norm": 0.20812229327123935, "learning_rate": 4.5184002322740784e-07, "loss": 0.0737, "step": 1349 }, { "epoch": 1.7285531370038414, "grad_norm": 0.18550126434105613, "learning_rate": 4.4767168107637537e-07, "loss": 0.0498, "step": 1350 }, { "epoch": 1.7298335467349553, "grad_norm": 0.20508279952656905, "learning_rate": 4.4352175389960505e-07, "loss": 0.0728, "step": 1351 }, { "epoch": 1.7311139564660691, "grad_norm": 0.2216209986509392, "learning_rate": 4.393902584842891e-07, "loss": 0.0828, "step": 1352 }, { "epoch": 1.732394366197183, "grad_norm": 0.19372857221832493, "learning_rate": 4.3527721154305703e-07, "loss": 0.0524, "step": 1353 }, { "epoch": 1.733674775928297, "grad_norm": 0.1877031333440656, "learning_rate": 4.3118262971391154e-07, "loss": 0.0554, "step": 1354 }, { "epoch": 1.734955185659411, "grad_norm": 0.19047204572465343, "learning_rate": 4.2710652956016287e-07, "loss": 0.0577, "step": 1355 }, { "epoch": 1.736235595390525, "grad_norm": 0.20168459183585527, "learning_rate": 4.230489275703564e-07, "loss": 0.07, "step": 1356 }, { "epoch": 1.737516005121639, "grad_norm": 0.18175171334497253, "learning_rate": 4.1900984015821267e-07, "loss": 0.0485, "step": 1357 }, { "epoch": 1.7387964148527528, "grad_norm": 0.20903586660942647, "learning_rate": 4.1498928366255466e-07, "loss": 0.0713, "step": 1358 }, { "epoch": 1.7400768245838667, "grad_norm": 0.243644449195975, "learning_rate": 4.109872743472443e-07, "loss": 0.07, "step": 1359 }, { "epoch": 1.7413572343149808, "grad_norm": 0.18201840211856285, "learning_rate": 4.0700382840111906e-07, "loss": 0.0599, "step": 1360 }, { "epoch": 1.742637644046095, "grad_norm": 0.20451956578853434, "learning_rate": 4.030389619379238e-07, "loss": 0.0623, "step": 1361 }, { "epoch": 1.7439180537772088, "grad_norm": 0.2188512950956738, "learning_rate": 3.990926909962445e-07, "loss": 0.0771, "step": 1362 }, { "epoch": 1.7451984635083226, "grad_norm": 0.2324903372095307, "learning_rate": 3.9516503153944673e-07, "loss": 0.0673, "step": 1363 }, { "epoch": 1.7464788732394365, "grad_norm": 0.1977160316868668, "learning_rate": 3.9125599945560866e-07, "loss": 0.0646, "step": 1364 }, { "epoch": 1.7477592829705506, "grad_norm": 0.1989584162969946, "learning_rate": 3.873656105574564e-07, "loss": 0.0632, "step": 1365 }, { "epoch": 1.7490396927016645, "grad_norm": 0.20098521158958274, "learning_rate": 3.8349388058230296e-07, "loss": 0.062, "step": 1366 }, { "epoch": 1.7503201024327786, "grad_norm": 0.19597027954610385, "learning_rate": 3.7964082519198087e-07, "loss": 0.0607, "step": 1367 }, { "epoch": 1.7516005121638925, "grad_norm": 0.20108728042860027, "learning_rate": 3.7580645997278287e-07, "loss": 0.059, "step": 1368 }, { "epoch": 1.7528809218950063, "grad_norm": 0.1921412861492757, "learning_rate": 3.719908004353939e-07, "loss": 0.0521, "step": 1369 }, { "epoch": 1.7541613316261202, "grad_norm": 0.2139820014475891, "learning_rate": 3.68193862014834e-07, "loss": 0.0817, "step": 1370 }, { "epoch": 1.7554417413572343, "grad_norm": 0.1982463479033246, "learning_rate": 3.6441566007038967e-07, "loss": 0.0708, "step": 1371 }, { "epoch": 1.7567221510883484, "grad_norm": 0.19654029086189598, "learning_rate": 3.606562098855587e-07, "loss": 0.0599, "step": 1372 }, { "epoch": 1.7580025608194623, "grad_norm": 0.2133187626686813, "learning_rate": 3.569155266679819e-07, "loss": 0.0772, "step": 1373 }, { "epoch": 1.7592829705505761, "grad_norm": 0.18362124759951692, "learning_rate": 3.5319362554938564e-07, "loss": 0.0502, "step": 1374 }, { "epoch": 1.76056338028169, "grad_norm": 0.192440132324627, "learning_rate": 3.4949052158551875e-07, "loss": 0.0616, "step": 1375 }, { "epoch": 1.7618437900128041, "grad_norm": 0.19430811354932426, "learning_rate": 3.4580622975609377e-07, "loss": 0.0592, "step": 1376 }, { "epoch": 1.7631241997439182, "grad_norm": 0.220109677351347, "learning_rate": 3.421407649647224e-07, "loss": 0.0781, "step": 1377 }, { "epoch": 1.764404609475032, "grad_norm": 0.20049853783584656, "learning_rate": 3.384941420388588e-07, "loss": 0.065, "step": 1378 }, { "epoch": 1.765685019206146, "grad_norm": 0.20139658895217075, "learning_rate": 3.3486637572973933e-07, "loss": 0.0633, "step": 1379 }, { "epoch": 1.7669654289372598, "grad_norm": 0.21063529236680298, "learning_rate": 3.3125748071232033e-07, "loss": 0.0789, "step": 1380 }, { "epoch": 1.768245838668374, "grad_norm": 0.21296854789192277, "learning_rate": 3.2766747158522126e-07, "loss": 0.0754, "step": 1381 }, { "epoch": 1.7695262483994878, "grad_norm": 0.1916752967201772, "learning_rate": 3.240963628706667e-07, "loss": 0.0559, "step": 1382 }, { "epoch": 1.770806658130602, "grad_norm": 0.2060524254586167, "learning_rate": 3.205441690144212e-07, "loss": 0.0722, "step": 1383 }, { "epoch": 1.7720870678617158, "grad_norm": 0.1973592566254937, "learning_rate": 3.1701090438574e-07, "loss": 0.0625, "step": 1384 }, { "epoch": 1.7733674775928296, "grad_norm": 0.20087273107907422, "learning_rate": 3.13496583277304e-07, "loss": 0.066, "step": 1385 }, { "epoch": 1.7746478873239435, "grad_norm": 0.21052814879565404, "learning_rate": 3.100012199051627e-07, "loss": 0.0783, "step": 1386 }, { "epoch": 1.7759282970550576, "grad_norm": 0.19196260784149885, "learning_rate": 3.065248284086819e-07, "loss": 0.0602, "step": 1387 }, { "epoch": 1.7772087067861717, "grad_norm": 0.19462154392446185, "learning_rate": 3.0306742285048095e-07, "loss": 0.0597, "step": 1388 }, { "epoch": 1.7784891165172856, "grad_norm": 0.20447192015331403, "learning_rate": 2.9962901721637517e-07, "loss": 0.0614, "step": 1389 }, { "epoch": 1.7797695262483995, "grad_norm": 0.19766467558190148, "learning_rate": 2.962096254153268e-07, "loss": 0.0624, "step": 1390 }, { "epoch": 1.7810499359795133, "grad_norm": 0.1806997675320888, "learning_rate": 2.928092612793809e-07, "loss": 0.0585, "step": 1391 }, { "epoch": 1.7823303457106274, "grad_norm": 0.19630629829030638, "learning_rate": 2.8942793856361117e-07, "loss": 0.0625, "step": 1392 }, { "epoch": 1.7836107554417413, "grad_norm": 0.212989521587174, "learning_rate": 2.860656709460685e-07, "loss": 0.0531, "step": 1393 }, { "epoch": 1.7848911651728554, "grad_norm": 0.2225818187353885, "learning_rate": 2.8272247202771995e-07, "loss": 0.0718, "step": 1394 }, { "epoch": 1.7861715749039693, "grad_norm": 0.2057154267684041, "learning_rate": 2.793983553323981e-07, "loss": 0.0651, "step": 1395 }, { "epoch": 1.7874519846350831, "grad_norm": 0.19916304976789587, "learning_rate": 2.7609333430674234e-07, "loss": 0.067, "step": 1396 }, { "epoch": 1.788732394366197, "grad_norm": 0.19336182430002904, "learning_rate": 2.728074223201488e-07, "loss": 0.0585, "step": 1397 }, { "epoch": 1.7900128040973111, "grad_norm": 0.1966954491833402, "learning_rate": 2.6954063266471374e-07, "loss": 0.0602, "step": 1398 }, { "epoch": 1.7912932138284252, "grad_norm": 0.18903248273814233, "learning_rate": 2.662929785551793e-07, "loss": 0.0602, "step": 1399 }, { "epoch": 1.792573623559539, "grad_norm": 0.21280053483476682, "learning_rate": 2.630644731288806e-07, "loss": 0.0678, "step": 1400 }, { "epoch": 1.792573623559539, "eval_loss": 0.08574853092432022, "eval_runtime": 10.9086, "eval_samples_per_second": 23.193, "eval_steps_per_second": 5.867, "step": 1400 }, { "epoch": 1.793854033290653, "grad_norm": 0.20437915500455855, "learning_rate": 2.5985512944569525e-07, "loss": 0.07, "step": 1401 }, { "epoch": 1.7951344430217668, "grad_norm": 0.19650991866521572, "learning_rate": 2.566649604879867e-07, "loss": 0.064, "step": 1402 }, { "epoch": 1.796414852752881, "grad_norm": 0.21662572079656986, "learning_rate": 2.534939791605534e-07, "loss": 0.0791, "step": 1403 }, { "epoch": 1.797695262483995, "grad_norm": 0.2002359166593617, "learning_rate": 2.503421982905768e-07, "loss": 0.0686, "step": 1404 }, { "epoch": 1.798975672215109, "grad_norm": 0.20066668968471985, "learning_rate": 2.4720963062756675e-07, "loss": 0.0651, "step": 1405 }, { "epoch": 1.8002560819462228, "grad_norm": 0.2268648431312511, "learning_rate": 2.440962888433163e-07, "loss": 0.0758, "step": 1406 }, { "epoch": 1.8015364916773366, "grad_norm": 0.19303296944215054, "learning_rate": 2.410021855318434e-07, "loss": 0.0658, "step": 1407 }, { "epoch": 1.8028169014084507, "grad_norm": 0.20411023491061306, "learning_rate": 2.3792733320934348e-07, "loss": 0.0572, "step": 1408 }, { "epoch": 1.8040973111395646, "grad_norm": 0.17373668352286573, "learning_rate": 2.3487174431414018e-07, "loss": 0.0464, "step": 1409 }, { "epoch": 1.8053777208706787, "grad_norm": 0.19938866386236145, "learning_rate": 2.3183543120662954e-07, "loss": 0.0678, "step": 1410 }, { "epoch": 1.8066581306017926, "grad_norm": 0.20042662117625965, "learning_rate": 2.2881840616923834e-07, "loss": 0.065, "step": 1411 }, { "epoch": 1.8079385403329065, "grad_norm": 0.19952594784436398, "learning_rate": 2.2582068140636514e-07, "loss": 0.0641, "step": 1412 }, { "epoch": 1.8092189500640203, "grad_norm": 0.2419969426190514, "learning_rate": 2.2284226904433826e-07, "loss": 0.0664, "step": 1413 }, { "epoch": 1.8104993597951344, "grad_norm": 0.18318837535492868, "learning_rate": 2.1988318113136396e-07, "loss": 0.053, "step": 1414 }, { "epoch": 1.8117797695262485, "grad_norm": 0.19590723577656866, "learning_rate": 2.169434296374756e-07, "loss": 0.0646, "step": 1415 }, { "epoch": 1.8130601792573624, "grad_norm": 0.20942550240218386, "learning_rate": 2.1402302645448903e-07, "loss": 0.0657, "step": 1416 }, { "epoch": 1.8143405889884763, "grad_norm": 0.21121323333173378, "learning_rate": 2.1112198339595325e-07, "loss": 0.0775, "step": 1417 }, { "epoch": 1.8156209987195902, "grad_norm": 0.20278116997207565, "learning_rate": 2.082403121970994e-07, "loss": 0.0666, "step": 1418 }, { "epoch": 1.8169014084507042, "grad_norm": 0.2111169180229197, "learning_rate": 2.053780245147996e-07, "loss": 0.0725, "step": 1419 }, { "epoch": 1.8181818181818183, "grad_norm": 0.20650359046146602, "learning_rate": 2.0253513192751374e-07, "loss": 0.0611, "step": 1420 }, { "epoch": 1.8194622279129322, "grad_norm": 0.20205220484442332, "learning_rate": 1.9971164593524493e-07, "loss": 0.0656, "step": 1421 }, { "epoch": 1.820742637644046, "grad_norm": 0.2042258318586851, "learning_rate": 1.969075779594948e-07, "loss": 0.0743, "step": 1422 }, { "epoch": 1.82202304737516, "grad_norm": 0.20499576189147584, "learning_rate": 1.941229393432159e-07, "loss": 0.0625, "step": 1423 }, { "epoch": 1.823303457106274, "grad_norm": 0.20313175592704724, "learning_rate": 1.913577413507628e-07, "loss": 0.066, "step": 1424 }, { "epoch": 1.824583866837388, "grad_norm": 0.20468965378896176, "learning_rate": 1.8861199516785223e-07, "loss": 0.0721, "step": 1425 }, { "epoch": 1.825864276568502, "grad_norm": 0.2062204208647408, "learning_rate": 1.8588571190151338e-07, "loss": 0.0749, "step": 1426 }, { "epoch": 1.827144686299616, "grad_norm": 0.19231980501587626, "learning_rate": 1.831789025800451e-07, "loss": 0.0581, "step": 1427 }, { "epoch": 1.8284250960307298, "grad_norm": 0.20889898207436958, "learning_rate": 1.8049157815297037e-07, "loss": 0.0791, "step": 1428 }, { "epoch": 1.8297055057618437, "grad_norm": 0.20464923524172784, "learning_rate": 1.7782374949099234e-07, "loss": 0.0602, "step": 1429 }, { "epoch": 1.8309859154929577, "grad_norm": 0.20451609627393177, "learning_rate": 1.7517542738595071e-07, "loss": 0.0691, "step": 1430 }, { "epoch": 1.8322663252240718, "grad_norm": 0.21088358338820315, "learning_rate": 1.7254662255077713e-07, "loss": 0.064, "step": 1431 }, { "epoch": 1.8335467349551857, "grad_norm": 0.19999371420241321, "learning_rate": 1.6993734561945198e-07, "loss": 0.0699, "step": 1432 }, { "epoch": 1.8348271446862996, "grad_norm": 0.22973642248696105, "learning_rate": 1.6734760714696374e-07, "loss": 0.0702, "step": 1433 }, { "epoch": 1.8361075544174135, "grad_norm": 0.20685577690462723, "learning_rate": 1.6477741760926315e-07, "loss": 0.064, "step": 1434 }, { "epoch": 1.8373879641485276, "grad_norm": 0.19034107393028568, "learning_rate": 1.6222678740322128e-07, "loss": 0.0613, "step": 1435 }, { "epoch": 1.8386683738796414, "grad_norm": 0.183960580124128, "learning_rate": 1.5969572684658986e-07, "loss": 0.0563, "step": 1436 }, { "epoch": 1.8399487836107555, "grad_norm": 0.19991408412894127, "learning_rate": 1.5718424617795602e-07, "loss": 0.0707, "step": 1437 }, { "epoch": 1.8412291933418694, "grad_norm": 0.20593674420047167, "learning_rate": 1.5469235555670537e-07, "loss": 0.0649, "step": 1438 }, { "epoch": 1.8425096030729833, "grad_norm": 0.2037324713397988, "learning_rate": 1.5222006506297515e-07, "loss": 0.0653, "step": 1439 }, { "epoch": 1.8437900128040972, "grad_norm": 0.2137248730111368, "learning_rate": 1.4976738469761886e-07, "loss": 0.0753, "step": 1440 }, { "epoch": 1.8450704225352113, "grad_norm": 0.20065948492897037, "learning_rate": 1.47334324382164e-07, "loss": 0.0607, "step": 1441 }, { "epoch": 1.8463508322663253, "grad_norm": 0.20261082328031546, "learning_rate": 1.4492089395876873e-07, "loss": 0.0669, "step": 1442 }, { "epoch": 1.8476312419974392, "grad_norm": 0.1972580782152948, "learning_rate": 1.4252710319018703e-07, "loss": 0.0627, "step": 1443 }, { "epoch": 1.848911651728553, "grad_norm": 0.2129790063656561, "learning_rate": 1.4015296175972748e-07, "loss": 0.0736, "step": 1444 }, { "epoch": 1.850192061459667, "grad_norm": 0.18954505811446765, "learning_rate": 1.3779847927121115e-07, "loss": 0.0595, "step": 1445 }, { "epoch": 1.851472471190781, "grad_norm": 0.19522260750763254, "learning_rate": 1.3546366524893828e-07, "loss": 0.0572, "step": 1446 }, { "epoch": 1.8527528809218952, "grad_norm": 0.21478038992971552, "learning_rate": 1.3314852913764442e-07, "loss": 0.0863, "step": 1447 }, { "epoch": 1.854033290653009, "grad_norm": 0.2137579027202419, "learning_rate": 1.3085308030246436e-07, "loss": 0.0638, "step": 1448 }, { "epoch": 1.855313700384123, "grad_norm": 0.1988136025846599, "learning_rate": 1.28577328028896e-07, "loss": 0.0624, "step": 1449 }, { "epoch": 1.8565941101152368, "grad_norm": 0.19553938553033282, "learning_rate": 1.2632128152276046e-07, "loss": 0.0639, "step": 1450 }, { "epoch": 1.8578745198463509, "grad_norm": 0.21041681079550378, "learning_rate": 1.240849499101643e-07, "loss": 0.0652, "step": 1451 }, { "epoch": 1.8591549295774648, "grad_norm": 0.5012586519737127, "learning_rate": 1.2186834223746612e-07, "loss": 0.0752, "step": 1452 }, { "epoch": 1.8604353393085789, "grad_norm": 0.19455330539796845, "learning_rate": 1.1967146747123626e-07, "loss": 0.0576, "step": 1453 }, { "epoch": 1.8617157490396927, "grad_norm": 0.21316863715084144, "learning_rate": 1.1749433449822156e-07, "loss": 0.0782, "step": 1454 }, { "epoch": 1.8629961587708066, "grad_norm": 0.20553955324868886, "learning_rate": 1.1533695212531115e-07, "loss": 0.0686, "step": 1455 }, { "epoch": 1.8642765685019205, "grad_norm": 0.21497821053099725, "learning_rate": 1.131993290794986e-07, "loss": 0.076, "step": 1456 }, { "epoch": 1.8655569782330346, "grad_norm": 0.1935883001897811, "learning_rate": 1.1108147400784808e-07, "loss": 0.059, "step": 1457 }, { "epoch": 1.8668373879641487, "grad_norm": 0.2049410446095373, "learning_rate": 1.0898339547745774e-07, "loss": 0.0718, "step": 1458 }, { "epoch": 1.8681177976952625, "grad_norm": 0.189300424887134, "learning_rate": 1.0690510197542692e-07, "loss": 0.0589, "step": 1459 }, { "epoch": 1.8693982074263764, "grad_norm": 0.22923696862347018, "learning_rate": 1.0484660190882123e-07, "loss": 0.0844, "step": 1460 }, { "epoch": 1.8706786171574903, "grad_norm": 0.21070229672284757, "learning_rate": 1.028079036046381e-07, "loss": 0.0766, "step": 1461 }, { "epoch": 1.8719590268886044, "grad_norm": 0.20995830861832052, "learning_rate": 1.0078901530977292e-07, "loss": 0.0749, "step": 1462 }, { "epoch": 1.8732394366197183, "grad_norm": 0.194168233081112, "learning_rate": 9.878994519098573e-08, "loss": 0.0668, "step": 1463 }, { "epoch": 1.8745198463508324, "grad_norm": 0.20305364871666273, "learning_rate": 9.681070133487014e-08, "loss": 0.0573, "step": 1464 }, { "epoch": 1.8758002560819462, "grad_norm": 0.1924549623073497, "learning_rate": 9.485129174781838e-08, "loss": 0.0573, "step": 1465 }, { "epoch": 1.87708066581306, "grad_norm": 0.2134007094155275, "learning_rate": 9.291172435598906e-08, "loss": 0.0681, "step": 1466 }, { "epoch": 1.878361075544174, "grad_norm": 0.2057969019823256, "learning_rate": 9.099200700527445e-08, "loss": 0.0637, "step": 1467 }, { "epoch": 1.879641485275288, "grad_norm": 0.21719020869575242, "learning_rate": 8.909214746127271e-08, "loss": 0.0812, "step": 1468 }, { "epoch": 1.8809218950064022, "grad_norm": 0.19588012392632634, "learning_rate": 8.721215340925182e-08, "loss": 0.0542, "step": 1469 }, { "epoch": 1.882202304737516, "grad_norm": 0.21193976870051412, "learning_rate": 8.535203245411961e-08, "loss": 0.0745, "step": 1470 }, { "epoch": 1.88348271446863, "grad_norm": 0.18925313637999477, "learning_rate": 8.351179212039651e-08, "loss": 0.0567, "step": 1471 }, { "epoch": 1.8847631241997438, "grad_norm": 0.21259732481459803, "learning_rate": 8.169143985217898e-08, "loss": 0.0733, "step": 1472 }, { "epoch": 1.8860435339308579, "grad_norm": 0.1986145219505155, "learning_rate": 7.989098301311559e-08, "loss": 0.0562, "step": 1473 }, { "epoch": 1.887323943661972, "grad_norm": 0.20013756141999367, "learning_rate": 7.81104288863721e-08, "loss": 0.0544, "step": 1474 }, { "epoch": 1.8886043533930859, "grad_norm": 0.22584492085019492, "learning_rate": 7.634978467460585e-08, "loss": 0.0787, "step": 1475 }, { "epoch": 1.8898847631241997, "grad_norm": 0.22945296296987813, "learning_rate": 7.460905749993474e-08, "loss": 0.0611, "step": 1476 }, { "epoch": 1.8911651728553136, "grad_norm": 0.20743198837504645, "learning_rate": 7.288825440390779e-08, "loss": 0.0699, "step": 1477 }, { "epoch": 1.8924455825864277, "grad_norm": 0.20526205747893375, "learning_rate": 7.118738234747847e-08, "loss": 0.0685, "step": 1478 }, { "epoch": 1.8937259923175416, "grad_norm": 0.25598881314793, "learning_rate": 6.950644821097641e-08, "loss": 0.0757, "step": 1479 }, { "epoch": 1.8950064020486557, "grad_norm": 0.2329825198634952, "learning_rate": 6.784545879407633e-08, "loss": 0.0636, "step": 1480 }, { "epoch": 1.8962868117797695, "grad_norm": 0.19698543474426225, "learning_rate": 6.620442081577528e-08, "loss": 0.0578, "step": 1481 }, { "epoch": 1.8975672215108834, "grad_norm": 0.20137914227631143, "learning_rate": 6.458334091436314e-08, "loss": 0.0699, "step": 1482 }, { "epoch": 1.8988476312419973, "grad_norm": 0.20246226248643925, "learning_rate": 6.298222564739387e-08, "loss": 0.0597, "step": 1483 }, { "epoch": 1.9001280409731114, "grad_norm": 0.20397571071403153, "learning_rate": 6.140108149166324e-08, "loss": 0.0529, "step": 1484 }, { "epoch": 1.9014084507042255, "grad_norm": 0.20741882976868115, "learning_rate": 5.983991484317997e-08, "loss": 0.0719, "step": 1485 }, { "epoch": 1.9026888604353394, "grad_norm": 0.20121671309758776, "learning_rate": 5.829873201713798e-08, "loss": 0.0713, "step": 1486 }, { "epoch": 1.9039692701664532, "grad_norm": 0.2005147954702294, "learning_rate": 5.677753924789586e-08, "loss": 0.0586, "step": 1487 }, { "epoch": 1.905249679897567, "grad_norm": 0.2121569590194792, "learning_rate": 5.527634268894744e-08, "loss": 0.0732, "step": 1488 }, { "epoch": 1.9065300896286812, "grad_norm": 0.18713146985448095, "learning_rate": 5.379514841289901e-08, "loss": 0.0582, "step": 1489 }, { "epoch": 1.9078104993597953, "grad_norm": 0.19850280824551791, "learning_rate": 5.233396241144328e-08, "loss": 0.0674, "step": 1490 }, { "epoch": 1.9090909090909092, "grad_norm": 0.1993436823368234, "learning_rate": 5.089279059533658e-08, "loss": 0.0641, "step": 1491 }, { "epoch": 1.910371318822023, "grad_norm": 0.20553420140832993, "learning_rate": 4.9471638794373887e-08, "loss": 0.0628, "step": 1492 }, { "epoch": 1.911651728553137, "grad_norm": 0.19278724644069462, "learning_rate": 4.8070512757366626e-08, "loss": 0.0554, "step": 1493 }, { "epoch": 1.912932138284251, "grad_norm": 0.20376972420140652, "learning_rate": 4.6689418152116585e-08, "loss": 0.0641, "step": 1494 }, { "epoch": 1.914212548015365, "grad_norm": 0.2266817821267302, "learning_rate": 4.532836056539702e-08, "loss": 0.0885, "step": 1495 }, { "epoch": 1.915492957746479, "grad_norm": 0.21597635996765827, "learning_rate": 4.398734550292716e-08, "loss": 0.0718, "step": 1496 }, { "epoch": 1.9167733674775929, "grad_norm": 0.20420790197425281, "learning_rate": 4.2666378389349396e-08, "loss": 0.0692, "step": 1497 }, { "epoch": 1.9180537772087067, "grad_norm": 0.19728941531935928, "learning_rate": 4.136546456821044e-08, "loss": 0.0631, "step": 1498 }, { "epoch": 1.9193341869398206, "grad_norm": 0.20583646042982434, "learning_rate": 4.0084609301936364e-08, "loss": 0.0704, "step": 1499 }, { "epoch": 1.9206145966709347, "grad_norm": 0.20349704435592017, "learning_rate": 3.8823817771814207e-08, "loss": 0.0746, "step": 1500 }, { "epoch": 1.9218950064020488, "grad_norm": 0.20380362218593492, "learning_rate": 3.7583095077968754e-08, "loss": 0.0616, "step": 1501 }, { "epoch": 1.9231754161331627, "grad_norm": 0.19076445380936458, "learning_rate": 3.6362446239343044e-08, "loss": 0.0594, "step": 1502 }, { "epoch": 1.9244558258642765, "grad_norm": 0.20905285911006427, "learning_rate": 3.5161876193677836e-08, "loss": 0.0762, "step": 1503 }, { "epoch": 1.9257362355953904, "grad_norm": 0.2042423625984266, "learning_rate": 3.398138979749166e-08, "loss": 0.0666, "step": 1504 }, { "epoch": 1.9270166453265045, "grad_norm": 0.21086072875647932, "learning_rate": 3.28209918260608e-08, "loss": 0.0761, "step": 1505 }, { "epoch": 1.9282970550576184, "grad_norm": 0.19400853082349276, "learning_rate": 3.168068697340043e-08, "loss": 0.062, "step": 1506 }, { "epoch": 1.9295774647887325, "grad_norm": 0.21457137674838256, "learning_rate": 3.0560479852246304e-08, "loss": 0.0707, "step": 1507 }, { "epoch": 1.9308578745198464, "grad_norm": 0.2320731089865819, "learning_rate": 2.9460374994034202e-08, "loss": 0.08, "step": 1508 }, { "epoch": 1.9321382842509602, "grad_norm": 0.19712962893663466, "learning_rate": 2.8380376848883285e-08, "loss": 0.0653, "step": 1509 }, { "epoch": 1.933418693982074, "grad_norm": 0.2041477762201593, "learning_rate": 2.7320489785576666e-08, "loss": 0.0685, "step": 1510 }, { "epoch": 1.9346991037131882, "grad_norm": 0.20089128123293434, "learning_rate": 2.6280718091545863e-08, "loss": 0.0703, "step": 1511 }, { "epoch": 1.9359795134443023, "grad_norm": 0.2086085013010463, "learning_rate": 2.526106597285194e-08, "loss": 0.0617, "step": 1512 }, { "epoch": 1.9372599231754162, "grad_norm": 0.19836733526099107, "learning_rate": 2.4261537554167157e-08, "loss": 0.0602, "step": 1513 }, { "epoch": 1.93854033290653, "grad_norm": 0.2093847757745558, "learning_rate": 2.3282136878761686e-08, "loss": 0.0608, "step": 1514 }, { "epoch": 1.939820742637644, "grad_norm": 0.20865206378563741, "learning_rate": 2.2322867908484147e-08, "loss": 0.067, "step": 1515 }, { "epoch": 1.941101152368758, "grad_norm": 0.24101199976207213, "learning_rate": 2.138373452374831e-08, "loss": 0.0806, "step": 1516 }, { "epoch": 1.9423815620998721, "grad_norm": 0.18692444097379804, "learning_rate": 2.0464740523514214e-08, "loss": 0.0588, "step": 1517 }, { "epoch": 1.943661971830986, "grad_norm": 0.20236804078412934, "learning_rate": 1.9565889625275945e-08, "loss": 0.0663, "step": 1518 }, { "epoch": 1.9449423815620999, "grad_norm": 0.20245417532304852, "learning_rate": 1.868718546504389e-08, "loss": 0.0624, "step": 1519 }, { "epoch": 1.9462227912932137, "grad_norm": 0.20111669507572685, "learning_rate": 1.7828631597333056e-08, "loss": 0.0657, "step": 1520 }, { "epoch": 1.9475032010243278, "grad_norm": 0.1854804000830463, "learning_rate": 1.6990231495144782e-08, "loss": 0.05, "step": 1521 }, { "epoch": 1.9487836107554417, "grad_norm": 0.19124827470933353, "learning_rate": 1.6171988549956718e-08, "loss": 0.0611, "step": 1522 }, { "epoch": 1.9500640204865558, "grad_norm": 0.1960702744274981, "learning_rate": 1.5373906071706192e-08, "loss": 0.0618, "step": 1523 }, { "epoch": 1.9513444302176697, "grad_norm": 0.1893947703161995, "learning_rate": 1.459598728877798e-08, "loss": 0.059, "step": 1524 }, { "epoch": 1.9526248399487836, "grad_norm": 0.20198668442581438, "learning_rate": 1.3838235347991558e-08, "loss": 0.074, "step": 1525 }, { "epoch": 1.9539052496798974, "grad_norm": 0.19638709745357963, "learning_rate": 1.3100653314587763e-08, "loss": 0.0625, "step": 1526 }, { "epoch": 1.9551856594110115, "grad_norm": 0.20002150407972272, "learning_rate": 1.238324417221659e-08, "loss": 0.0598, "step": 1527 }, { "epoch": 1.9564660691421256, "grad_norm": 0.19741690014803207, "learning_rate": 1.168601082292442e-08, "loss": 0.0603, "step": 1528 }, { "epoch": 1.9577464788732395, "grad_norm": 0.2149168971243108, "learning_rate": 1.1008956087144585e-08, "loss": 0.0837, "step": 1529 }, { "epoch": 1.9590268886043534, "grad_norm": 0.1921243700187775, "learning_rate": 1.0352082703682931e-08, "loss": 0.0634, "step": 1530 }, { "epoch": 1.9603072983354672, "grad_norm": 0.21074769668697263, "learning_rate": 9.715393329708944e-09, "loss": 0.0727, "step": 1531 }, { "epoch": 1.9615877080665813, "grad_norm": 0.2028820674017271, "learning_rate": 9.098890540742977e-09, "loss": 0.0631, "step": 1532 }, { "epoch": 1.9628681177976952, "grad_norm": 0.18956236574482416, "learning_rate": 8.502576830649034e-09, "loss": 0.0518, "step": 1533 }, { "epoch": 1.9641485275288093, "grad_norm": 0.20204262410800616, "learning_rate": 7.926454611621448e-09, "loss": 0.0619, "step": 1534 }, { "epoch": 1.9654289372599232, "grad_norm": 0.21077254425505618, "learning_rate": 7.3705262141759995e-09, "loss": 0.0617, "step": 1535 }, { "epoch": 1.966709346991037, "grad_norm": 0.21661741540787505, "learning_rate": 6.834793887142143e-09, "loss": 0.0552, "step": 1536 }, { "epoch": 1.967989756722151, "grad_norm": 0.2215528858593377, "learning_rate": 6.319259797651911e-09, "loss": 0.0745, "step": 1537 }, { "epoch": 1.969270166453265, "grad_norm": 0.18630682284883943, "learning_rate": 5.823926031132132e-09, "loss": 0.0484, "step": 1538 }, { "epoch": 1.9705505761843791, "grad_norm": 0.1974431443612188, "learning_rate": 5.348794591295004e-09, "loss": 0.0582, "step": 1539 }, { "epoch": 1.971830985915493, "grad_norm": 0.20150993056267935, "learning_rate": 4.89386740013198e-09, "loss": 0.0661, "step": 1540 }, { "epoch": 1.9731113956466069, "grad_norm": 0.20470996306813885, "learning_rate": 4.459146297903783e-09, "loss": 0.0749, "step": 1541 }, { "epoch": 1.9743918053777207, "grad_norm": 0.2061033200739359, "learning_rate": 4.044633043134294e-09, "loss": 0.0619, "step": 1542 }, { "epoch": 1.9756722151088348, "grad_norm": 0.20271743751150123, "learning_rate": 3.6503293126033402e-09, "loss": 0.0613, "step": 1543 }, { "epoch": 1.976952624839949, "grad_norm": 0.20490402834712004, "learning_rate": 3.2762367013394747e-09, "loss": 0.0694, "step": 1544 }, { "epoch": 1.9782330345710628, "grad_norm": 0.2119186487939203, "learning_rate": 2.9223567226127626e-09, "loss": 0.0739, "step": 1545 }, { "epoch": 1.9795134443021767, "grad_norm": 0.21803236604286477, "learning_rate": 2.5886908079308935e-09, "loss": 0.0568, "step": 1546 }, { "epoch": 1.9807938540332906, "grad_norm": 0.20619184711819616, "learning_rate": 2.275240307031412e-09, "loss": 0.0699, "step": 1547 }, { "epoch": 1.9820742637644047, "grad_norm": 0.1941770969857438, "learning_rate": 1.9820064878772748e-09, "loss": 0.0559, "step": 1548 }, { "epoch": 1.9833546734955185, "grad_norm": 0.18804896708872632, "learning_rate": 1.7089905366507453e-09, "loss": 0.0615, "step": 1549 }, { "epoch": 1.9846350832266326, "grad_norm": 0.2106463401125963, "learning_rate": 1.456193557750618e-09, "loss": 0.063, "step": 1550 }, { "epoch": 1.9859154929577465, "grad_norm": 0.19494170606960837, "learning_rate": 1.2236165737850025e-09, "loss": 0.0592, "step": 1551 }, { "epoch": 1.9871959026888604, "grad_norm": 0.2172364562258022, "learning_rate": 1.0112605255685471e-09, "loss": 0.0706, "step": 1552 }, { "epoch": 1.9884763124199742, "grad_norm": 0.2093839169633442, "learning_rate": 8.191262721196635e-10, "loss": 0.0737, "step": 1553 }, { "epoch": 1.9897567221510883, "grad_norm": 0.18837206670180967, "learning_rate": 6.472145906555316e-10, "loss": 0.0547, "step": 1554 }, { "epoch": 1.9910371318822024, "grad_norm": 0.1941186800350548, "learning_rate": 4.955261765882125e-10, "loss": 0.0606, "step": 1555 }, { "epoch": 1.9923175416133163, "grad_norm": 0.20306804847244714, "learning_rate": 3.6406164352575934e-10, "loss": 0.066, "step": 1556 }, { "epoch": 1.9935979513444302, "grad_norm": 0.20067518542060178, "learning_rate": 2.5282152326444596e-10, "loss": 0.0626, "step": 1557 }, { "epoch": 1.994878361075544, "grad_norm": 0.20020475918450265, "learning_rate": 1.6180626579043179e-10, "loss": 0.0646, "step": 1558 }, { "epoch": 1.9961587708066582, "grad_norm": 0.19166919169773772, "learning_rate": 9.101623927698644e-11, "loss": 0.0594, "step": 1559 }, { "epoch": 1.9974391805377723, "grad_norm": 0.20735248882758955, "learning_rate": 4.0451730081714216e-11, "loss": 0.0739, "step": 1560 }, { "epoch": 1.9987195902688861, "grad_norm": 0.18725676653291543, "learning_rate": 1.0112942747664278e-11, "loss": 0.0643, "step": 1561 }, { "epoch": 2.0, "grad_norm": 0.1872906769419353, "learning_rate": 0.0, "loss": 0.0606, "step": 1562 }, { "epoch": 2.0, "step": 1562, "total_flos": 88995614687232.0, "train_loss": 0.08387258817488268, "train_runtime": 2839.4866, "train_samples_per_second": 17.583, "train_steps_per_second": 0.55 } ], "logging_steps": 1, "max_steps": 1562, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 88995614687232.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }