diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,40227 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.40009755077866427, + "eval_steps": 500, + "global_step": 5742, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.967912761732224e-05, + "grad_norm": 2.733745568803013, + "learning_rate": 6.999999924522766e-07, + "loss": 1.7192, + "step": 1 + }, + { + "epoch": 0.00013935825523464447, + "grad_norm": 2.709369185413448, + "learning_rate": 6.99999969809107e-07, + "loss": 1.6019, + "step": 2 + }, + { + "epoch": 0.0002090373828519667, + "grad_norm": 2.9135830924208594, + "learning_rate": 6.999999320704921e-07, + "loss": 1.8484, + "step": 3 + }, + { + "epoch": 0.00027871651046928894, + "grad_norm": 2.777419413695461, + "learning_rate": 6.999998792364337e-07, + "loss": 1.756, + "step": 4 + }, + { + "epoch": 0.00034839563808661115, + "grad_norm": 2.8075005508831694, + "learning_rate": 6.999998113069345e-07, + "loss": 1.7913, + "step": 5 + }, + { + "epoch": 0.0004180747657039334, + "grad_norm": 2.668293350418484, + "learning_rate": 6.999997282819976e-07, + "loss": 1.6321, + "step": 6 + }, + { + "epoch": 0.0004877538933212556, + "grad_norm": 2.4745384488107187, + "learning_rate": 6.999996301616271e-07, + "loss": 1.5594, + "step": 7 + }, + { + "epoch": 0.0005574330209385779, + "grad_norm": 2.3430588976825826, + "learning_rate": 6.999995169458277e-07, + "loss": 1.6613, + "step": 8 + }, + { + "epoch": 0.0006271121485559001, + "grad_norm": 2.2357126724663092, + "learning_rate": 6.999993886346046e-07, + "loss": 1.676, + "step": 9 + }, + { + "epoch": 0.0006967912761732223, + "grad_norm": 2.342047279317775, + "learning_rate": 6.999992452279643e-07, + "loss": 1.6192, + "step": 10 + }, + { + "epoch": 0.0007664704037905445, + "grad_norm": 2.2540685801049944, + "learning_rate": 6.999990867259133e-07, + "loss": 1.559, + "step": 11 + }, + { + "epoch": 0.0008361495314078668, + "grad_norm": 1.971114382158942, + "learning_rate": 6.999989131284595e-07, + "loss": 1.6581, + "step": 12 + }, + { + "epoch": 0.000905828659025189, + "grad_norm": 1.5959396288528505, + "learning_rate": 6.999987244356111e-07, + "loss": 1.721, + "step": 13 + }, + { + "epoch": 0.0009755077866425112, + "grad_norm": 1.4800088775047004, + "learning_rate": 6.999985206473771e-07, + "loss": 1.7666, + "step": 14 + }, + { + "epoch": 0.0010451869142598335, + "grad_norm": 1.3687402013136123, + "learning_rate": 6.999983017637673e-07, + "loss": 1.726, + "step": 15 + }, + { + "epoch": 0.0011148660418771558, + "grad_norm": 1.391725060939141, + "learning_rate": 6.999980677847923e-07, + "loss": 1.7181, + "step": 16 + }, + { + "epoch": 0.0011845451694944779, + "grad_norm": 1.1938617951720005, + "learning_rate": 6.999978187104631e-07, + "loss": 1.5963, + "step": 17 + }, + { + "epoch": 0.0012542242971118002, + "grad_norm": 1.2751779681441469, + "learning_rate": 6.999975545407918e-07, + "loss": 1.6692, + "step": 18 + }, + { + "epoch": 0.0013239034247291223, + "grad_norm": 1.2381900634264318, + "learning_rate": 6.99997275275791e-07, + "loss": 1.7631, + "step": 19 + }, + { + "epoch": 0.0013935825523464446, + "grad_norm": 1.153546411353916, + "learning_rate": 6.999969809154741e-07, + "loss": 1.7649, + "step": 20 + }, + { + "epoch": 0.001463261679963767, + "grad_norm": 1.1646630042222352, + "learning_rate": 6.999966714598552e-07, + "loss": 1.7358, + "step": 21 + }, + { + "epoch": 0.001532940807581089, + "grad_norm": 1.1023651577627083, + "learning_rate": 6.99996346908949e-07, + "loss": 1.67, + "step": 22 + }, + { + "epoch": 0.0016026199351984113, + "grad_norm": 1.1875711007742904, + "learning_rate": 6.999960072627713e-07, + "loss": 1.622, + "step": 23 + }, + { + "epoch": 0.0016722990628157337, + "grad_norm": 1.0365697887615943, + "learning_rate": 6.999956525213383e-07, + "loss": 1.7234, + "step": 24 + }, + { + "epoch": 0.0017419781904330558, + "grad_norm": 0.9973919994515652, + "learning_rate": 6.99995282684667e-07, + "loss": 1.7004, + "step": 25 + }, + { + "epoch": 0.001811657318050378, + "grad_norm": 1.3266756191624058, + "learning_rate": 6.999948977527749e-07, + "loss": 1.7692, + "step": 26 + }, + { + "epoch": 0.0018813364456677002, + "grad_norm": 1.4466381604576397, + "learning_rate": 6.999944977256808e-07, + "loss": 1.4938, + "step": 27 + }, + { + "epoch": 0.0019510155732850225, + "grad_norm": 1.5268786523739337, + "learning_rate": 6.999940826034036e-07, + "loss": 1.7042, + "step": 28 + }, + { + "epoch": 0.002020694700902345, + "grad_norm": 1.4725142311197557, + "learning_rate": 6.999936523859633e-07, + "loss": 1.4254, + "step": 29 + }, + { + "epoch": 0.002090373828519667, + "grad_norm": 1.4663543510668822, + "learning_rate": 6.999932070733806e-07, + "loss": 1.5621, + "step": 30 + }, + { + "epoch": 0.002160052956136989, + "grad_norm": 1.5525614196605468, + "learning_rate": 6.999927466656765e-07, + "loss": 1.7332, + "step": 31 + }, + { + "epoch": 0.0022297320837543115, + "grad_norm": 1.3577154717206668, + "learning_rate": 6.999922711628735e-07, + "loss": 1.5684, + "step": 32 + }, + { + "epoch": 0.0022994112113716336, + "grad_norm": 1.2464378205015698, + "learning_rate": 6.999917805649943e-07, + "loss": 1.7046, + "step": 33 + }, + { + "epoch": 0.0023690903389889557, + "grad_norm": 1.2571057892933462, + "learning_rate": 6.999912748720621e-07, + "loss": 1.6985, + "step": 34 + }, + { + "epoch": 0.0024387694666062783, + "grad_norm": 1.046761844185471, + "learning_rate": 6.999907540841015e-07, + "loss": 1.5903, + "step": 35 + }, + { + "epoch": 0.0025084485942236004, + "grad_norm": 1.0190382659116664, + "learning_rate": 6.999902182011371e-07, + "loss": 1.6446, + "step": 36 + }, + { + "epoch": 0.0025781277218409225, + "grad_norm": 0.9420347898121777, + "learning_rate": 6.999896672231951e-07, + "loss": 1.6862, + "step": 37 + }, + { + "epoch": 0.0026478068494582446, + "grad_norm": 0.9594587432273478, + "learning_rate": 6.999891011503013e-07, + "loss": 1.6502, + "step": 38 + }, + { + "epoch": 0.002717485977075567, + "grad_norm": 0.8779244551322024, + "learning_rate": 6.999885199824832e-07, + "loss": 1.5531, + "step": 39 + }, + { + "epoch": 0.002787165104692889, + "grad_norm": 0.8480799062449409, + "learning_rate": 6.999879237197686e-07, + "loss": 1.4879, + "step": 40 + }, + { + "epoch": 0.0028568442323102113, + "grad_norm": 0.9497551105971518, + "learning_rate": 6.99987312362186e-07, + "loss": 1.6479, + "step": 41 + }, + { + "epoch": 0.002926523359927534, + "grad_norm": 0.8618279518712127, + "learning_rate": 6.999866859097648e-07, + "loss": 1.662, + "step": 42 + }, + { + "epoch": 0.002996202487544856, + "grad_norm": 0.9018906795561105, + "learning_rate": 6.999860443625348e-07, + "loss": 1.5774, + "step": 43 + }, + { + "epoch": 0.003065881615162178, + "grad_norm": 0.8612830023344303, + "learning_rate": 6.999853877205271e-07, + "loss": 1.8141, + "step": 44 + }, + { + "epoch": 0.0031355607427795006, + "grad_norm": 0.9007192891107818, + "learning_rate": 6.999847159837729e-07, + "loss": 1.6209, + "step": 45 + }, + { + "epoch": 0.0032052398703968227, + "grad_norm": 0.8585620295584652, + "learning_rate": 6.999840291523043e-07, + "loss": 1.6896, + "step": 46 + }, + { + "epoch": 0.0032749189980141448, + "grad_norm": 0.8582642386199358, + "learning_rate": 6.999833272261546e-07, + "loss": 1.5751, + "step": 47 + }, + { + "epoch": 0.0033445981256314673, + "grad_norm": 0.9273580515998116, + "learning_rate": 6.999826102053571e-07, + "loss": 1.5397, + "step": 48 + }, + { + "epoch": 0.0034142772532487894, + "grad_norm": 0.8614901819193356, + "learning_rate": 6.999818780899462e-07, + "loss": 1.5651, + "step": 49 + }, + { + "epoch": 0.0034839563808661115, + "grad_norm": 0.8055350569759604, + "learning_rate": 6.999811308799572e-07, + "loss": 1.5396, + "step": 50 + }, + { + "epoch": 0.0035536355084834336, + "grad_norm": 0.7667301976321873, + "learning_rate": 6.999803685754257e-07, + "loss": 1.5635, + "step": 51 + }, + { + "epoch": 0.003623314636100756, + "grad_norm": 0.7558223853302638, + "learning_rate": 6.999795911763883e-07, + "loss": 1.4957, + "step": 52 + }, + { + "epoch": 0.0036929937637180782, + "grad_norm": 0.8569578181282904, + "learning_rate": 6.999787986828822e-07, + "loss": 1.5791, + "step": 53 + }, + { + "epoch": 0.0037626728913354003, + "grad_norm": 0.8084428013047424, + "learning_rate": 6.999779910949456e-07, + "loss": 1.6145, + "step": 54 + }, + { + "epoch": 0.003832352018952723, + "grad_norm": 0.7559081128636418, + "learning_rate": 6.999771684126168e-07, + "loss": 1.5544, + "step": 55 + }, + { + "epoch": 0.003902031146570045, + "grad_norm": 0.7689467380996695, + "learning_rate": 6.999763306359357e-07, + "loss": 1.5876, + "step": 56 + }, + { + "epoch": 0.003971710274187367, + "grad_norm": 0.7544375614009099, + "learning_rate": 6.99975477764942e-07, + "loss": 1.5481, + "step": 57 + }, + { + "epoch": 0.00404138940180469, + "grad_norm": 0.7831679946606233, + "learning_rate": 6.999746097996769e-07, + "loss": 1.5945, + "step": 58 + }, + { + "epoch": 0.004111068529422011, + "grad_norm": 0.750603996125084, + "learning_rate": 6.999737267401817e-07, + "loss": 1.5258, + "step": 59 + }, + { + "epoch": 0.004180747657039334, + "grad_norm": 0.7953623833277211, + "learning_rate": 6.999728285864989e-07, + "loss": 1.5848, + "step": 60 + }, + { + "epoch": 0.004250426784656656, + "grad_norm": 0.7574855755742249, + "learning_rate": 6.999719153386717e-07, + "loss": 1.621, + "step": 61 + }, + { + "epoch": 0.004320105912273978, + "grad_norm": 0.776253899297405, + "learning_rate": 6.999709869967437e-07, + "loss": 1.6088, + "step": 62 + }, + { + "epoch": 0.0043897850398913005, + "grad_norm": 0.7610571494782742, + "learning_rate": 6.999700435607591e-07, + "loss": 1.5922, + "step": 63 + }, + { + "epoch": 0.004459464167508623, + "grad_norm": 0.8152423242847946, + "learning_rate": 6.999690850307637e-07, + "loss": 1.6002, + "step": 64 + }, + { + "epoch": 0.004529143295125945, + "grad_norm": 0.7968474238058425, + "learning_rate": 6.99968111406803e-07, + "loss": 1.5156, + "step": 65 + }, + { + "epoch": 0.004598822422743267, + "grad_norm": 0.7974893416155076, + "learning_rate": 6.999671226889238e-07, + "loss": 1.655, + "step": 66 + }, + { + "epoch": 0.00466850155036059, + "grad_norm": 0.7912729979364881, + "learning_rate": 6.999661188771734e-07, + "loss": 1.628, + "step": 67 + }, + { + "epoch": 0.0047381806779779115, + "grad_norm": 0.7550934267500832, + "learning_rate": 6.999650999716002e-07, + "loss": 1.7023, + "step": 68 + }, + { + "epoch": 0.004807859805595234, + "grad_norm": 0.7295311010941107, + "learning_rate": 6.999640659722525e-07, + "loss": 1.589, + "step": 69 + }, + { + "epoch": 0.0048775389332125565, + "grad_norm": 0.7758262875672367, + "learning_rate": 6.999630168791802e-07, + "loss": 1.6251, + "step": 70 + }, + { + "epoch": 0.004947218060829878, + "grad_norm": 0.7449174445401683, + "learning_rate": 6.999619526924338e-07, + "loss": 1.5023, + "step": 71 + }, + { + "epoch": 0.005016897188447201, + "grad_norm": 0.7692068872313949, + "learning_rate": 6.999608734120638e-07, + "loss": 1.6567, + "step": 72 + }, + { + "epoch": 0.005086576316064523, + "grad_norm": 0.8275918260872214, + "learning_rate": 6.999597790381221e-07, + "loss": 1.708, + "step": 73 + }, + { + "epoch": 0.005156255443681845, + "grad_norm": 0.7872033802821833, + "learning_rate": 6.999586695706612e-07, + "loss": 1.7163, + "step": 74 + }, + { + "epoch": 0.0052259345712991675, + "grad_norm": 0.8159200519159217, + "learning_rate": 6.999575450097343e-07, + "loss": 1.5663, + "step": 75 + }, + { + "epoch": 0.005295613698916489, + "grad_norm": 0.7956432710560783, + "learning_rate": 6.999564053553953e-07, + "loss": 1.697, + "step": 76 + }, + { + "epoch": 0.005365292826533812, + "grad_norm": 0.7130908379784697, + "learning_rate": 6.999552506076987e-07, + "loss": 1.6055, + "step": 77 + }, + { + "epoch": 0.005434971954151134, + "grad_norm": 0.7274551108988991, + "learning_rate": 6.999540807667e-07, + "loss": 1.5876, + "step": 78 + }, + { + "epoch": 0.005504651081768456, + "grad_norm": 0.7646927508527166, + "learning_rate": 6.99952895832455e-07, + "loss": 1.6219, + "step": 79 + }, + { + "epoch": 0.005574330209385778, + "grad_norm": 0.8844817783804978, + "learning_rate": 6.999516958050207e-07, + "loss": 1.5852, + "step": 80 + }, + { + "epoch": 0.005644009337003101, + "grad_norm": 0.8046092626958843, + "learning_rate": 6.999504806844547e-07, + "loss": 1.6941, + "step": 81 + }, + { + "epoch": 0.005713688464620423, + "grad_norm": 0.7565003415350436, + "learning_rate": 6.999492504708149e-07, + "loss": 1.5299, + "step": 82 + }, + { + "epoch": 0.005783367592237745, + "grad_norm": 0.7491176510626603, + "learning_rate": 6.999480051641605e-07, + "loss": 1.4925, + "step": 83 + }, + { + "epoch": 0.005853046719855068, + "grad_norm": 0.7518065408047131, + "learning_rate": 6.999467447645511e-07, + "loss": 1.6606, + "step": 84 + }, + { + "epoch": 0.005922725847472389, + "grad_norm": 0.7208224360353647, + "learning_rate": 6.999454692720472e-07, + "loss": 1.5146, + "step": 85 + }, + { + "epoch": 0.005992404975089712, + "grad_norm": 0.7845222532902206, + "learning_rate": 6.999441786867097e-07, + "loss": 1.6499, + "step": 86 + }, + { + "epoch": 0.006062084102707034, + "grad_norm": 0.7179978138866828, + "learning_rate": 6.999428730086007e-07, + "loss": 1.6328, + "step": 87 + }, + { + "epoch": 0.006131763230324356, + "grad_norm": 0.7211806613439429, + "learning_rate": 6.999415522377827e-07, + "loss": 1.5726, + "step": 88 + }, + { + "epoch": 0.006201442357941679, + "grad_norm": 0.7546953013822226, + "learning_rate": 6.999402163743188e-07, + "loss": 1.5601, + "step": 89 + }, + { + "epoch": 0.006271121485559001, + "grad_norm": 0.7566779977768506, + "learning_rate": 6.999388654182733e-07, + "loss": 1.684, + "step": 90 + }, + { + "epoch": 0.006340800613176323, + "grad_norm": 0.6969488028265275, + "learning_rate": 6.999374993697108e-07, + "loss": 1.5598, + "step": 91 + }, + { + "epoch": 0.006410479740793645, + "grad_norm": 0.705981536567612, + "learning_rate": 6.999361182286967e-07, + "loss": 1.5545, + "step": 92 + }, + { + "epoch": 0.006480158868410968, + "grad_norm": 0.8466488562144117, + "learning_rate": 6.999347219952973e-07, + "loss": 1.6598, + "step": 93 + }, + { + "epoch": 0.0065498379960282896, + "grad_norm": 0.7800529271957665, + "learning_rate": 6.999333106695795e-07, + "loss": 1.6096, + "step": 94 + }, + { + "epoch": 0.006619517123645612, + "grad_norm": 0.7752456360469325, + "learning_rate": 6.999318842516109e-07, + "loss": 1.6597, + "step": 95 + }, + { + "epoch": 0.006689196251262935, + "grad_norm": 0.6961730716580421, + "learning_rate": 6.999304427414599e-07, + "loss": 1.4991, + "step": 96 + }, + { + "epoch": 0.006758875378880256, + "grad_norm": 0.7111901307005418, + "learning_rate": 6.999289861391954e-07, + "loss": 1.5348, + "step": 97 + }, + { + "epoch": 0.006828554506497579, + "grad_norm": 0.7729231425029259, + "learning_rate": 6.999275144448875e-07, + "loss": 1.7086, + "step": 98 + }, + { + "epoch": 0.0068982336341149005, + "grad_norm": 0.7207807898538051, + "learning_rate": 6.999260276586065e-07, + "loss": 1.4942, + "step": 99 + }, + { + "epoch": 0.006967912761732223, + "grad_norm": 0.8209832140800927, + "learning_rate": 6.999245257804236e-07, + "loss": 1.6462, + "step": 100 + }, + { + "epoch": 0.0070375918893495456, + "grad_norm": 0.7372508955817842, + "learning_rate": 6.999230088104111e-07, + "loss": 1.6425, + "step": 101 + }, + { + "epoch": 0.007107271016966867, + "grad_norm": 0.7103080276487075, + "learning_rate": 6.999214767486415e-07, + "loss": 1.5064, + "step": 102 + }, + { + "epoch": 0.00717695014458419, + "grad_norm": 0.7453563805733722, + "learning_rate": 6.999199295951881e-07, + "loss": 1.5735, + "step": 103 + }, + { + "epoch": 0.007246629272201512, + "grad_norm": 0.7316234271209868, + "learning_rate": 6.999183673501252e-07, + "loss": 1.487, + "step": 104 + }, + { + "epoch": 0.007316308399818834, + "grad_norm": 0.8306495639187648, + "learning_rate": 6.999167900135276e-07, + "loss": 1.6901, + "step": 105 + }, + { + "epoch": 0.0073859875274361565, + "grad_norm": 0.6877525548972586, + "learning_rate": 6.99915197585471e-07, + "loss": 1.5244, + "step": 106 + }, + { + "epoch": 0.007455666655053479, + "grad_norm": 0.7545823598451985, + "learning_rate": 6.999135900660315e-07, + "loss": 1.6525, + "step": 107 + }, + { + "epoch": 0.007525345782670801, + "grad_norm": 0.7557325144444581, + "learning_rate": 6.999119674552864e-07, + "loss": 1.5528, + "step": 108 + }, + { + "epoch": 0.007595024910288123, + "grad_norm": 0.7363162596156012, + "learning_rate": 6.999103297533131e-07, + "loss": 1.6752, + "step": 109 + }, + { + "epoch": 0.007664704037905446, + "grad_norm": 0.7779785261187538, + "learning_rate": 6.999086769601905e-07, + "loss": 1.6054, + "step": 110 + }, + { + "epoch": 0.007734383165522767, + "grad_norm": 0.7375695814517242, + "learning_rate": 6.999070090759976e-07, + "loss": 1.6362, + "step": 111 + }, + { + "epoch": 0.00780406229314009, + "grad_norm": 0.7295801652020935, + "learning_rate": 6.999053261008142e-07, + "loss": 1.6278, + "step": 112 + }, + { + "epoch": 0.007873741420757412, + "grad_norm": 0.7564600054233208, + "learning_rate": 6.999036280347211e-07, + "loss": 1.5602, + "step": 113 + }, + { + "epoch": 0.007943420548374734, + "grad_norm": 0.7325116000746935, + "learning_rate": 6.999019148777999e-07, + "loss": 1.5854, + "step": 114 + }, + { + "epoch": 0.008013099675992057, + "grad_norm": 0.750333027438248, + "learning_rate": 6.999001866301322e-07, + "loss": 1.6378, + "step": 115 + }, + { + "epoch": 0.00808277880360938, + "grad_norm": 0.7077391988654039, + "learning_rate": 6.998984432918011e-07, + "loss": 1.4496, + "step": 116 + }, + { + "epoch": 0.008152457931226702, + "grad_norm": 0.7061194584771281, + "learning_rate": 6.998966848628901e-07, + "loss": 1.7211, + "step": 117 + }, + { + "epoch": 0.008222137058844023, + "grad_norm": 0.753741884650152, + "learning_rate": 6.998949113434836e-07, + "loss": 1.5725, + "step": 118 + }, + { + "epoch": 0.008291816186461345, + "grad_norm": 0.8776126975121025, + "learning_rate": 6.998931227336665e-07, + "loss": 1.4603, + "step": 119 + }, + { + "epoch": 0.008361495314078668, + "grad_norm": 0.7457924178802692, + "learning_rate": 6.998913190335243e-07, + "loss": 1.5817, + "step": 120 + }, + { + "epoch": 0.00843117444169599, + "grad_norm": 0.6662869164399065, + "learning_rate": 6.998895002431439e-07, + "loss": 1.4134, + "step": 121 + }, + { + "epoch": 0.008500853569313313, + "grad_norm": 0.730929392364289, + "learning_rate": 6.998876663626121e-07, + "loss": 1.6551, + "step": 122 + }, + { + "epoch": 0.008570532696930635, + "grad_norm": 0.7485428242937736, + "learning_rate": 6.998858173920168e-07, + "loss": 1.5917, + "step": 123 + }, + { + "epoch": 0.008640211824547956, + "grad_norm": 0.7041960816293819, + "learning_rate": 6.998839533314466e-07, + "loss": 1.5663, + "step": 124 + }, + { + "epoch": 0.008709890952165279, + "grad_norm": 0.7065757407197867, + "learning_rate": 6.998820741809911e-07, + "loss": 1.5738, + "step": 125 + }, + { + "epoch": 0.008779570079782601, + "grad_norm": 0.7167544228047786, + "learning_rate": 6.998801799407402e-07, + "loss": 1.4507, + "step": 126 + }, + { + "epoch": 0.008849249207399924, + "grad_norm": 0.7373858233625152, + "learning_rate": 6.998782706107846e-07, + "loss": 1.5638, + "step": 127 + }, + { + "epoch": 0.008918928335017246, + "grad_norm": 0.762790306505906, + "learning_rate": 6.998763461912157e-07, + "loss": 1.5871, + "step": 128 + }, + { + "epoch": 0.008988607462634567, + "grad_norm": 0.7997672459056562, + "learning_rate": 6.998744066821259e-07, + "loss": 1.7292, + "step": 129 + }, + { + "epoch": 0.00905828659025189, + "grad_norm": 0.6949799061752019, + "learning_rate": 6.998724520836082e-07, + "loss": 1.557, + "step": 130 + }, + { + "epoch": 0.009127965717869212, + "grad_norm": 0.7429111002352855, + "learning_rate": 6.998704823957561e-07, + "loss": 1.6075, + "step": 131 + }, + { + "epoch": 0.009197644845486535, + "grad_norm": 0.7247660368087409, + "learning_rate": 6.99868497618664e-07, + "loss": 1.4907, + "step": 132 + }, + { + "epoch": 0.009267323973103857, + "grad_norm": 0.753497515434336, + "learning_rate": 6.998664977524273e-07, + "loss": 1.5622, + "step": 133 + }, + { + "epoch": 0.00933700310072118, + "grad_norm": 0.7525401766307482, + "learning_rate": 6.998644827971414e-07, + "loss": 1.6525, + "step": 134 + }, + { + "epoch": 0.0094066822283385, + "grad_norm": 0.7719829875157536, + "learning_rate": 6.998624527529032e-07, + "loss": 1.7565, + "step": 135 + }, + { + "epoch": 0.009476361355955823, + "grad_norm": 0.7780799099616043, + "learning_rate": 6.998604076198099e-07, + "loss": 1.5999, + "step": 136 + }, + { + "epoch": 0.009546040483573145, + "grad_norm": 0.6788851566259799, + "learning_rate": 6.998583473979593e-07, + "loss": 1.5832, + "step": 137 + }, + { + "epoch": 0.009615719611190468, + "grad_norm": 0.6759649759261548, + "learning_rate": 6.998562720874505e-07, + "loss": 1.5318, + "step": 138 + }, + { + "epoch": 0.00968539873880779, + "grad_norm": 0.722411480856436, + "learning_rate": 6.998541816883826e-07, + "loss": 1.4956, + "step": 139 + }, + { + "epoch": 0.009755077866425113, + "grad_norm": 0.7963536281427879, + "learning_rate": 6.99852076200856e-07, + "loss": 1.6532, + "step": 140 + }, + { + "epoch": 0.009824756994042434, + "grad_norm": 0.711429048225027, + "learning_rate": 6.998499556249715e-07, + "loss": 1.5737, + "step": 141 + }, + { + "epoch": 0.009894436121659756, + "grad_norm": 0.6897849482508333, + "learning_rate": 6.998478199608309e-07, + "loss": 1.5071, + "step": 142 + }, + { + "epoch": 0.009964115249277079, + "grad_norm": 0.699945633650181, + "learning_rate": 6.998456692085361e-07, + "loss": 1.5943, + "step": 143 + }, + { + "epoch": 0.010033794376894401, + "grad_norm": 0.6689453841253, + "learning_rate": 6.998435033681907e-07, + "loss": 1.4587, + "step": 144 + }, + { + "epoch": 0.010103473504511724, + "grad_norm": 0.7133832784569326, + "learning_rate": 6.998413224398982e-07, + "loss": 1.5861, + "step": 145 + }, + { + "epoch": 0.010173152632129047, + "grad_norm": 0.7036410895242622, + "learning_rate": 6.998391264237631e-07, + "loss": 1.6346, + "step": 146 + }, + { + "epoch": 0.010242831759746367, + "grad_norm": 0.6886627438173859, + "learning_rate": 6.998369153198908e-07, + "loss": 1.5589, + "step": 147 + }, + { + "epoch": 0.01031251088736369, + "grad_norm": 0.7367558477965669, + "learning_rate": 6.998346891283872e-07, + "loss": 1.6064, + "step": 148 + }, + { + "epoch": 0.010382190014981012, + "grad_norm": 0.8105693813585202, + "learning_rate": 6.998324478493588e-07, + "loss": 1.6741, + "step": 149 + }, + { + "epoch": 0.010451869142598335, + "grad_norm": 0.7110523737644423, + "learning_rate": 6.998301914829133e-07, + "loss": 1.5858, + "step": 150 + }, + { + "epoch": 0.010521548270215658, + "grad_norm": 0.7292303967140454, + "learning_rate": 6.998279200291587e-07, + "loss": 1.644, + "step": 151 + }, + { + "epoch": 0.010591227397832978, + "grad_norm": 0.7220512349531498, + "learning_rate": 6.998256334882037e-07, + "loss": 1.5486, + "step": 152 + }, + { + "epoch": 0.0106609065254503, + "grad_norm": 0.7698867140395331, + "learning_rate": 6.998233318601581e-07, + "loss": 1.5194, + "step": 153 + }, + { + "epoch": 0.010730585653067623, + "grad_norm": 0.9732965207520899, + "learning_rate": 6.998210151451322e-07, + "loss": 1.5542, + "step": 154 + }, + { + "epoch": 0.010800264780684946, + "grad_norm": 0.7118245740667292, + "learning_rate": 6.998186833432369e-07, + "loss": 1.5825, + "step": 155 + }, + { + "epoch": 0.010869943908302268, + "grad_norm": 0.6767247887601149, + "learning_rate": 6.998163364545839e-07, + "loss": 1.5421, + "step": 156 + }, + { + "epoch": 0.010939623035919591, + "grad_norm": 0.7463837156055579, + "learning_rate": 6.998139744792858e-07, + "loss": 1.3482, + "step": 157 + }, + { + "epoch": 0.011009302163536912, + "grad_norm": 0.7023208534797809, + "learning_rate": 6.998115974174558e-07, + "loss": 1.5365, + "step": 158 + }, + { + "epoch": 0.011078981291154234, + "grad_norm": 0.7779897717499841, + "learning_rate": 6.998092052692076e-07, + "loss": 1.5526, + "step": 159 + }, + { + "epoch": 0.011148660418771557, + "grad_norm": 0.7022624765789008, + "learning_rate": 6.998067980346561e-07, + "loss": 1.5957, + "step": 160 + }, + { + "epoch": 0.01121833954638888, + "grad_norm": 0.7690238675537974, + "learning_rate": 6.998043757139166e-07, + "loss": 1.6412, + "step": 161 + }, + { + "epoch": 0.011288018674006202, + "grad_norm": 0.6977898185845065, + "learning_rate": 6.99801938307105e-07, + "loss": 1.5129, + "step": 162 + }, + { + "epoch": 0.011357697801623524, + "grad_norm": 0.7360024484783224, + "learning_rate": 6.997994858143384e-07, + "loss": 1.6246, + "step": 163 + }, + { + "epoch": 0.011427376929240845, + "grad_norm": 0.743352598561228, + "learning_rate": 6.997970182357341e-07, + "loss": 1.6589, + "step": 164 + }, + { + "epoch": 0.011497056056858168, + "grad_norm": 0.7438090842992685, + "learning_rate": 6.997945355714104e-07, + "loss": 1.7993, + "step": 165 + }, + { + "epoch": 0.01156673518447549, + "grad_norm": 0.7806976756154644, + "learning_rate": 6.997920378214862e-07, + "loss": 1.6904, + "step": 166 + }, + { + "epoch": 0.011636414312092813, + "grad_norm": 0.7617691034349805, + "learning_rate": 6.997895249860815e-07, + "loss": 1.5514, + "step": 167 + }, + { + "epoch": 0.011706093439710135, + "grad_norm": 0.6897868904950596, + "learning_rate": 6.997869970653164e-07, + "loss": 1.7041, + "step": 168 + }, + { + "epoch": 0.011775772567327458, + "grad_norm": 0.7171263670275106, + "learning_rate": 6.997844540593121e-07, + "loss": 1.5569, + "step": 169 + }, + { + "epoch": 0.011845451694944779, + "grad_norm": 0.7349273462264111, + "learning_rate": 6.997818959681906e-07, + "loss": 1.6214, + "step": 170 + }, + { + "epoch": 0.011915130822562101, + "grad_norm": 0.7595114670832553, + "learning_rate": 6.997793227920744e-07, + "loss": 1.7286, + "step": 171 + }, + { + "epoch": 0.011984809950179424, + "grad_norm": 0.6743455616725484, + "learning_rate": 6.997767345310868e-07, + "loss": 1.5967, + "step": 172 + }, + { + "epoch": 0.012054489077796746, + "grad_norm": 0.8442688339279371, + "learning_rate": 6.997741311853519e-07, + "loss": 1.5291, + "step": 173 + }, + { + "epoch": 0.012124168205414069, + "grad_norm": 0.7135683838950063, + "learning_rate": 6.997715127549944e-07, + "loss": 1.547, + "step": 174 + }, + { + "epoch": 0.01219384733303139, + "grad_norm": 0.7680553601030344, + "learning_rate": 6.997688792401398e-07, + "loss": 1.5167, + "step": 175 + }, + { + "epoch": 0.012263526460648712, + "grad_norm": 0.7351716147785469, + "learning_rate": 6.997662306409142e-07, + "loss": 1.5149, + "step": 176 + }, + { + "epoch": 0.012333205588266035, + "grad_norm": 0.708247558633638, + "learning_rate": 6.997635669574448e-07, + "loss": 1.5914, + "step": 177 + }, + { + "epoch": 0.012402884715883357, + "grad_norm": 0.7447006944945493, + "learning_rate": 6.997608881898589e-07, + "loss": 1.6293, + "step": 178 + }, + { + "epoch": 0.01247256384350068, + "grad_norm": 0.7344799477527223, + "learning_rate": 6.997581943382852e-07, + "loss": 1.6402, + "step": 179 + }, + { + "epoch": 0.012542242971118002, + "grad_norm": 0.7278240397379574, + "learning_rate": 6.997554854028525e-07, + "loss": 1.5459, + "step": 180 + }, + { + "epoch": 0.012611922098735323, + "grad_norm": 0.7020532255212869, + "learning_rate": 6.997527613836908e-07, + "loss": 1.5233, + "step": 181 + }, + { + "epoch": 0.012681601226352646, + "grad_norm": 0.7588235184758088, + "learning_rate": 6.997500222809307e-07, + "loss": 1.5568, + "step": 182 + }, + { + "epoch": 0.012751280353969968, + "grad_norm": 0.7530180397421503, + "learning_rate": 6.997472680947033e-07, + "loss": 1.5685, + "step": 183 + }, + { + "epoch": 0.01282095948158729, + "grad_norm": 0.7180796924001129, + "learning_rate": 6.997444988251405e-07, + "loss": 1.6382, + "step": 184 + }, + { + "epoch": 0.012890638609204613, + "grad_norm": 0.7664151954290458, + "learning_rate": 6.997417144723754e-07, + "loss": 1.5858, + "step": 185 + }, + { + "epoch": 0.012960317736821936, + "grad_norm": 0.7437188630824249, + "learning_rate": 6.99738915036541e-07, + "loss": 1.6035, + "step": 186 + }, + { + "epoch": 0.013029996864439257, + "grad_norm": 0.7201391130892155, + "learning_rate": 6.997361005177718e-07, + "loss": 1.5728, + "step": 187 + }, + { + "epoch": 0.013099675992056579, + "grad_norm": 0.7259717563800355, + "learning_rate": 6.997332709162024e-07, + "loss": 1.6235, + "step": 188 + }, + { + "epoch": 0.013169355119673902, + "grad_norm": 0.8047726194305217, + "learning_rate": 6.997304262319686e-07, + "loss": 1.7083, + "step": 189 + }, + { + "epoch": 0.013239034247291224, + "grad_norm": 0.7286276274616162, + "learning_rate": 6.997275664652066e-07, + "loss": 1.65, + "step": 190 + }, + { + "epoch": 0.013308713374908547, + "grad_norm": 0.7179888240618036, + "learning_rate": 6.997246916160535e-07, + "loss": 1.4977, + "step": 191 + }, + { + "epoch": 0.01337839250252587, + "grad_norm": 0.6454186584546411, + "learning_rate": 6.99721801684647e-07, + "loss": 1.4368, + "step": 192 + }, + { + "epoch": 0.01344807163014319, + "grad_norm": 0.8428174000154314, + "learning_rate": 6.997188966711259e-07, + "loss": 1.6111, + "step": 193 + }, + { + "epoch": 0.013517750757760513, + "grad_norm": 0.7142931494445387, + "learning_rate": 6.997159765756289e-07, + "loss": 1.4782, + "step": 194 + }, + { + "epoch": 0.013587429885377835, + "grad_norm": 0.7013011170224142, + "learning_rate": 6.997130413982963e-07, + "loss": 1.6277, + "step": 195 + }, + { + "epoch": 0.013657109012995158, + "grad_norm": 0.7323477085758628, + "learning_rate": 6.997100911392687e-07, + "loss": 1.6999, + "step": 196 + }, + { + "epoch": 0.01372678814061248, + "grad_norm": 0.6782232596228719, + "learning_rate": 6.997071257986873e-07, + "loss": 1.5167, + "step": 197 + }, + { + "epoch": 0.013796467268229801, + "grad_norm": 0.7661525942281635, + "learning_rate": 6.997041453766945e-07, + "loss": 1.5548, + "step": 198 + }, + { + "epoch": 0.013866146395847124, + "grad_norm": 0.7078911668543628, + "learning_rate": 6.997011498734329e-07, + "loss": 1.6429, + "step": 199 + }, + { + "epoch": 0.013935825523464446, + "grad_norm": 0.7303745750427693, + "learning_rate": 6.996981392890463e-07, + "loss": 1.5804, + "step": 200 + }, + { + "epoch": 0.014005504651081769, + "grad_norm": 0.6831734889505485, + "learning_rate": 6.996951136236786e-07, + "loss": 1.4779, + "step": 201 + }, + { + "epoch": 0.014075183778699091, + "grad_norm": 0.7347205886402175, + "learning_rate": 6.99692072877475e-07, + "loss": 1.5836, + "step": 202 + }, + { + "epoch": 0.014144862906316414, + "grad_norm": 0.7096195389620719, + "learning_rate": 6.996890170505813e-07, + "loss": 1.6197, + "step": 203 + }, + { + "epoch": 0.014214542033933734, + "grad_norm": 0.7562441052582111, + "learning_rate": 6.996859461431439e-07, + "loss": 1.7908, + "step": 204 + }, + { + "epoch": 0.014284221161551057, + "grad_norm": 0.7334066931518938, + "learning_rate": 6.996828601553098e-07, + "loss": 1.4389, + "step": 205 + }, + { + "epoch": 0.01435390028916838, + "grad_norm": 0.6840997752191323, + "learning_rate": 6.99679759087227e-07, + "loss": 1.5409, + "step": 206 + }, + { + "epoch": 0.014423579416785702, + "grad_norm": 0.7665481621203577, + "learning_rate": 6.996766429390443e-07, + "loss": 1.5991, + "step": 207 + }, + { + "epoch": 0.014493258544403025, + "grad_norm": 0.7817725118434202, + "learning_rate": 6.996735117109106e-07, + "loss": 1.5223, + "step": 208 + }, + { + "epoch": 0.014562937672020347, + "grad_norm": 0.7283839189183391, + "learning_rate": 6.996703654029764e-07, + "loss": 1.6684, + "step": 209 + }, + { + "epoch": 0.014632616799637668, + "grad_norm": 0.7032582333066247, + "learning_rate": 6.996672040153923e-07, + "loss": 1.5299, + "step": 210 + }, + { + "epoch": 0.01470229592725499, + "grad_norm": 0.8077693130167044, + "learning_rate": 6.996640275483096e-07, + "loss": 1.5585, + "step": 211 + }, + { + "epoch": 0.014771975054872313, + "grad_norm": 0.7057816760896902, + "learning_rate": 6.996608360018808e-07, + "loss": 1.6727, + "step": 212 + }, + { + "epoch": 0.014841654182489636, + "grad_norm": 0.7312558829705393, + "learning_rate": 6.996576293762589e-07, + "loss": 1.5828, + "step": 213 + }, + { + "epoch": 0.014911333310106958, + "grad_norm": 0.6892094634096153, + "learning_rate": 6.996544076715972e-07, + "loss": 1.5521, + "step": 214 + }, + { + "epoch": 0.014981012437724279, + "grad_norm": 0.7622336534357208, + "learning_rate": 6.996511708880504e-07, + "loss": 1.5927, + "step": 215 + }, + { + "epoch": 0.015050691565341601, + "grad_norm": 0.8615964785895364, + "learning_rate": 6.996479190257735e-07, + "loss": 1.495, + "step": 216 + }, + { + "epoch": 0.015120370692958924, + "grad_norm": 0.7461775817348781, + "learning_rate": 6.996446520849225e-07, + "loss": 1.6432, + "step": 217 + }, + { + "epoch": 0.015190049820576246, + "grad_norm": 0.702229537181909, + "learning_rate": 6.996413700656536e-07, + "loss": 1.5812, + "step": 218 + }, + { + "epoch": 0.015259728948193569, + "grad_norm": 0.7563790834731114, + "learning_rate": 6.996380729681244e-07, + "loss": 1.4811, + "step": 219 + }, + { + "epoch": 0.015329408075810892, + "grad_norm": 0.7242920709990179, + "learning_rate": 6.99634760792493e-07, + "loss": 1.4832, + "step": 220 + }, + { + "epoch": 0.015399087203428212, + "grad_norm": 0.7170579173498212, + "learning_rate": 6.996314335389177e-07, + "loss": 1.5585, + "step": 221 + }, + { + "epoch": 0.015468766331045535, + "grad_norm": 0.7477520743417225, + "learning_rate": 6.996280912075582e-07, + "loss": 1.6152, + "step": 222 + }, + { + "epoch": 0.015538445458662857, + "grad_norm": 0.7365668010013201, + "learning_rate": 6.996247337985746e-07, + "loss": 1.6056, + "step": 223 + }, + { + "epoch": 0.01560812458628018, + "grad_norm": 0.7407770775087316, + "learning_rate": 6.99621361312128e-07, + "loss": 1.6089, + "step": 224 + }, + { + "epoch": 0.0156778037138975, + "grad_norm": 0.7501162465150802, + "learning_rate": 6.996179737483797e-07, + "loss": 1.5875, + "step": 225 + }, + { + "epoch": 0.015747482841514823, + "grad_norm": 0.703304864155775, + "learning_rate": 6.996145711074923e-07, + "loss": 1.4987, + "step": 226 + }, + { + "epoch": 0.015817161969132146, + "grad_norm": 0.7613996009018047, + "learning_rate": 6.996111533896286e-07, + "loss": 1.517, + "step": 227 + }, + { + "epoch": 0.01588684109674947, + "grad_norm": 0.7295854207507647, + "learning_rate": 6.996077205949528e-07, + "loss": 1.5285, + "step": 228 + }, + { + "epoch": 0.01595652022436679, + "grad_norm": 0.7074237961088794, + "learning_rate": 6.99604272723629e-07, + "loss": 1.46, + "step": 229 + }, + { + "epoch": 0.016026199351984113, + "grad_norm": 0.7560396455970501, + "learning_rate": 6.996008097758227e-07, + "loss": 1.505, + "step": 230 + }, + { + "epoch": 0.016095878479601436, + "grad_norm": 0.7333373513299797, + "learning_rate": 6.995973317516995e-07, + "loss": 1.5871, + "step": 231 + }, + { + "epoch": 0.01616555760721876, + "grad_norm": 0.640181179700889, + "learning_rate": 6.995938386514265e-07, + "loss": 1.5224, + "step": 232 + }, + { + "epoch": 0.01623523673483608, + "grad_norm": 0.7101909631345903, + "learning_rate": 6.995903304751709e-07, + "loss": 1.6038, + "step": 233 + }, + { + "epoch": 0.016304915862453404, + "grad_norm": 0.7454843789810528, + "learning_rate": 6.995868072231007e-07, + "loss": 1.627, + "step": 234 + }, + { + "epoch": 0.016374594990070723, + "grad_norm": 0.7024591060628032, + "learning_rate": 6.99583268895385e-07, + "loss": 1.5807, + "step": 235 + }, + { + "epoch": 0.016444274117688045, + "grad_norm": 0.8070089400837613, + "learning_rate": 6.995797154921931e-07, + "loss": 1.5066, + "step": 236 + }, + { + "epoch": 0.016513953245305368, + "grad_norm": 0.7292465008262952, + "learning_rate": 6.995761470136955e-07, + "loss": 1.5627, + "step": 237 + }, + { + "epoch": 0.01658363237292269, + "grad_norm": 0.7678168219350536, + "learning_rate": 6.995725634600631e-07, + "loss": 1.6508, + "step": 238 + }, + { + "epoch": 0.016653311500540013, + "grad_norm": 0.7171811611510969, + "learning_rate": 6.995689648314677e-07, + "loss": 1.6866, + "step": 239 + }, + { + "epoch": 0.016722990628157335, + "grad_norm": 0.7466351681320509, + "learning_rate": 6.995653511280816e-07, + "loss": 1.6049, + "step": 240 + }, + { + "epoch": 0.016792669755774658, + "grad_norm": 0.7266320402780408, + "learning_rate": 6.995617223500782e-07, + "loss": 1.5321, + "step": 241 + }, + { + "epoch": 0.01686234888339198, + "grad_norm": 0.7063647038715658, + "learning_rate": 6.995580784976312e-07, + "loss": 1.5047, + "step": 242 + }, + { + "epoch": 0.016932028011009303, + "grad_norm": 0.7085394609516711, + "learning_rate": 6.995544195709153e-07, + "loss": 1.7102, + "step": 243 + }, + { + "epoch": 0.017001707138626625, + "grad_norm": 0.7046716168804864, + "learning_rate": 6.995507455701059e-07, + "loss": 1.4328, + "step": 244 + }, + { + "epoch": 0.017071386266243948, + "grad_norm": 0.72750562924709, + "learning_rate": 6.995470564953791e-07, + "loss": 1.5067, + "step": 245 + }, + { + "epoch": 0.01714106539386127, + "grad_norm": 0.6583564224352184, + "learning_rate": 6.995433523469114e-07, + "loss": 1.5941, + "step": 246 + }, + { + "epoch": 0.01721074452147859, + "grad_norm": 0.6829271380484534, + "learning_rate": 6.995396331248806e-07, + "loss": 1.5829, + "step": 247 + }, + { + "epoch": 0.017280423649095912, + "grad_norm": 0.7501970674237022, + "learning_rate": 6.99535898829465e-07, + "loss": 1.5741, + "step": 248 + }, + { + "epoch": 0.017350102776713235, + "grad_norm": 0.718589643719869, + "learning_rate": 6.995321494608432e-07, + "loss": 1.6546, + "step": 249 + }, + { + "epoch": 0.017419781904330557, + "grad_norm": 0.7018977730704544, + "learning_rate": 6.995283850191951e-07, + "loss": 1.6441, + "step": 250 + }, + { + "epoch": 0.01748946103194788, + "grad_norm": 0.7116814108818308, + "learning_rate": 6.99524605504701e-07, + "loss": 1.6247, + "step": 251 + }, + { + "epoch": 0.017559140159565202, + "grad_norm": 0.7417559961150321, + "learning_rate": 6.995208109175422e-07, + "loss": 1.6553, + "step": 252 + }, + { + "epoch": 0.017628819287182525, + "grad_norm": 0.7195473670622379, + "learning_rate": 6.995170012579004e-07, + "loss": 1.4587, + "step": 253 + }, + { + "epoch": 0.017698498414799847, + "grad_norm": 0.6889531952135118, + "learning_rate": 6.995131765259583e-07, + "loss": 1.6015, + "step": 254 + }, + { + "epoch": 0.01776817754241717, + "grad_norm": 0.6800146240575404, + "learning_rate": 6.995093367218991e-07, + "loss": 1.542, + "step": 255 + }, + { + "epoch": 0.017837856670034492, + "grad_norm": 0.706603053149328, + "learning_rate": 6.995054818459067e-07, + "loss": 1.516, + "step": 256 + }, + { + "epoch": 0.017907535797651815, + "grad_norm": 0.7258769121019596, + "learning_rate": 6.99501611898166e-07, + "loss": 1.5032, + "step": 257 + }, + { + "epoch": 0.017977214925269134, + "grad_norm": 0.7475511019702241, + "learning_rate": 6.994977268788624e-07, + "loss": 1.5867, + "step": 258 + }, + { + "epoch": 0.018046894052886456, + "grad_norm": 0.74299733714245, + "learning_rate": 6.99493826788182e-07, + "loss": 1.6865, + "step": 259 + }, + { + "epoch": 0.01811657318050378, + "grad_norm": 0.7066253885889029, + "learning_rate": 6.99489911626312e-07, + "loss": 1.6019, + "step": 260 + }, + { + "epoch": 0.0181862523081211, + "grad_norm": 0.8182502015789678, + "learning_rate": 6.994859813934395e-07, + "loss": 1.4875, + "step": 261 + }, + { + "epoch": 0.018255931435738424, + "grad_norm": 0.7488901193748735, + "learning_rate": 6.994820360897534e-07, + "loss": 1.5313, + "step": 262 + }, + { + "epoch": 0.018325610563355747, + "grad_norm": 0.7298797429645452, + "learning_rate": 6.994780757154423e-07, + "loss": 1.5985, + "step": 263 + }, + { + "epoch": 0.01839528969097307, + "grad_norm": 0.7494430286408182, + "learning_rate": 6.994741002706963e-07, + "loss": 1.6039, + "step": 264 + }, + { + "epoch": 0.01846496881859039, + "grad_norm": 0.749305407018932, + "learning_rate": 6.994701097557058e-07, + "loss": 1.6647, + "step": 265 + }, + { + "epoch": 0.018534647946207714, + "grad_norm": 0.7393237097632813, + "learning_rate": 6.99466104170662e-07, + "loss": 1.6921, + "step": 266 + }, + { + "epoch": 0.018604327073825037, + "grad_norm": 0.7052812834008583, + "learning_rate": 6.99462083515757e-07, + "loss": 1.4998, + "step": 267 + }, + { + "epoch": 0.01867400620144236, + "grad_norm": 0.7656850091837507, + "learning_rate": 6.994580477911834e-07, + "loss": 1.551, + "step": 268 + }, + { + "epoch": 0.018743685329059682, + "grad_norm": 0.7262479768296017, + "learning_rate": 6.994539969971345e-07, + "loss": 1.6432, + "step": 269 + }, + { + "epoch": 0.018813364456677, + "grad_norm": 0.6957600485001357, + "learning_rate": 6.994499311338046e-07, + "loss": 1.6025, + "step": 270 + }, + { + "epoch": 0.018883043584294323, + "grad_norm": 0.6905054997272053, + "learning_rate": 6.994458502013883e-07, + "loss": 1.4301, + "step": 271 + }, + { + "epoch": 0.018952722711911646, + "grad_norm": 0.7179445614004244, + "learning_rate": 6.994417542000813e-07, + "loss": 1.5864, + "step": 272 + }, + { + "epoch": 0.01902240183952897, + "grad_norm": 0.7072984855963963, + "learning_rate": 6.994376431300801e-07, + "loss": 1.6053, + "step": 273 + }, + { + "epoch": 0.01909208096714629, + "grad_norm": 0.7570107373874823, + "learning_rate": 6.994335169915813e-07, + "loss": 1.6383, + "step": 274 + }, + { + "epoch": 0.019161760094763614, + "grad_norm": 0.7274334966890454, + "learning_rate": 6.99429375784783e-07, + "loss": 1.625, + "step": 275 + }, + { + "epoch": 0.019231439222380936, + "grad_norm": 0.7563658771609096, + "learning_rate": 6.994252195098834e-07, + "loss": 1.4685, + "step": 276 + }, + { + "epoch": 0.01930111834999826, + "grad_norm": 0.6811578449534527, + "learning_rate": 6.994210481670817e-07, + "loss": 1.5123, + "step": 277 + }, + { + "epoch": 0.01937079747761558, + "grad_norm": 0.6786727534901147, + "learning_rate": 6.994168617565782e-07, + "loss": 1.4686, + "step": 278 + }, + { + "epoch": 0.019440476605232904, + "grad_norm": 0.7655364222476935, + "learning_rate": 6.994126602785729e-07, + "loss": 1.4576, + "step": 279 + }, + { + "epoch": 0.019510155732850226, + "grad_norm": 0.6821324813628526, + "learning_rate": 6.994084437332676e-07, + "loss": 1.5197, + "step": 280 + }, + { + "epoch": 0.019579834860467545, + "grad_norm": 0.7262970704517422, + "learning_rate": 6.99404212120864e-07, + "loss": 1.4957, + "step": 281 + }, + { + "epoch": 0.019649513988084868, + "grad_norm": 0.743676021736133, + "learning_rate": 6.993999654415654e-07, + "loss": 1.6085, + "step": 282 + }, + { + "epoch": 0.01971919311570219, + "grad_norm": 0.6378550410731294, + "learning_rate": 6.993957036955749e-07, + "loss": 1.602, + "step": 283 + }, + { + "epoch": 0.019788872243319513, + "grad_norm": 0.7246033604464897, + "learning_rate": 6.993914268830967e-07, + "loss": 1.5291, + "step": 284 + }, + { + "epoch": 0.019858551370936835, + "grad_norm": 0.6880276199996247, + "learning_rate": 6.99387135004336e-07, + "loss": 1.4173, + "step": 285 + }, + { + "epoch": 0.019928230498554158, + "grad_norm": 0.6947157355311014, + "learning_rate": 6.993828280594983e-07, + "loss": 1.6025, + "step": 286 + }, + { + "epoch": 0.01999790962617148, + "grad_norm": 0.7435927158927187, + "learning_rate": 6.993785060487901e-07, + "loss": 1.4261, + "step": 287 + }, + { + "epoch": 0.020067588753788803, + "grad_norm": 0.7142473424703729, + "learning_rate": 6.993741689724185e-07, + "loss": 1.5124, + "step": 288 + }, + { + "epoch": 0.020137267881406126, + "grad_norm": 0.7008072048840218, + "learning_rate": 6.993698168305914e-07, + "loss": 1.5411, + "step": 289 + }, + { + "epoch": 0.020206947009023448, + "grad_norm": 0.7435854653268873, + "learning_rate": 6.993654496235172e-07, + "loss": 1.6077, + "step": 290 + }, + { + "epoch": 0.02027662613664077, + "grad_norm": 0.725379113270835, + "learning_rate": 6.993610673514052e-07, + "loss": 1.5128, + "step": 291 + }, + { + "epoch": 0.020346305264258093, + "grad_norm": 0.7117406405072563, + "learning_rate": 6.993566700144656e-07, + "loss": 1.6271, + "step": 292 + }, + { + "epoch": 0.020415984391875412, + "grad_norm": 0.702802060197601, + "learning_rate": 6.99352257612909e-07, + "loss": 1.6355, + "step": 293 + }, + { + "epoch": 0.020485663519492735, + "grad_norm": 0.7416404216373786, + "learning_rate": 6.993478301469469e-07, + "loss": 1.5558, + "step": 294 + }, + { + "epoch": 0.020555342647110057, + "grad_norm": 6.474567925910475, + "learning_rate": 6.993433876167913e-07, + "loss": 1.6586, + "step": 295 + }, + { + "epoch": 0.02062502177472738, + "grad_norm": 0.7080837056054183, + "learning_rate": 6.993389300226553e-07, + "loss": 1.6812, + "step": 296 + }, + { + "epoch": 0.020694700902344702, + "grad_norm": 0.6653198180715081, + "learning_rate": 6.993344573647524e-07, + "loss": 1.4974, + "step": 297 + }, + { + "epoch": 0.020764380029962025, + "grad_norm": 0.7204476403446304, + "learning_rate": 6.99329969643297e-07, + "loss": 1.5675, + "step": 298 + }, + { + "epoch": 0.020834059157579347, + "grad_norm": 0.7445246032941663, + "learning_rate": 6.993254668585042e-07, + "loss": 1.5476, + "step": 299 + }, + { + "epoch": 0.02090373828519667, + "grad_norm": 0.7460425723778988, + "learning_rate": 6.993209490105897e-07, + "loss": 1.4898, + "step": 300 + }, + { + "epoch": 0.020973417412813992, + "grad_norm": 0.7157641517249568, + "learning_rate": 6.9931641609977e-07, + "loss": 1.5786, + "step": 301 + }, + { + "epoch": 0.021043096540431315, + "grad_norm": 0.7016023826923905, + "learning_rate": 6.993118681262625e-07, + "loss": 1.5981, + "step": 302 + }, + { + "epoch": 0.021112775668048638, + "grad_norm": 0.7382367733171915, + "learning_rate": 6.993073050902849e-07, + "loss": 1.5072, + "step": 303 + }, + { + "epoch": 0.021182454795665957, + "grad_norm": 0.7606065005113284, + "learning_rate": 6.99302726992056e-07, + "loss": 1.4621, + "step": 304 + }, + { + "epoch": 0.02125213392328328, + "grad_norm": 0.6813739822883382, + "learning_rate": 6.992981338317952e-07, + "loss": 1.4303, + "step": 305 + }, + { + "epoch": 0.0213218130509006, + "grad_norm": 9.560327004767403, + "learning_rate": 6.992935256097225e-07, + "loss": 1.6614, + "step": 306 + }, + { + "epoch": 0.021391492178517924, + "grad_norm": 0.7301189610598565, + "learning_rate": 6.992889023260589e-07, + "loss": 1.7052, + "step": 307 + }, + { + "epoch": 0.021461171306135247, + "grad_norm": 0.7407277030456509, + "learning_rate": 6.992842639810258e-07, + "loss": 1.6177, + "step": 308 + }, + { + "epoch": 0.02153085043375257, + "grad_norm": 0.6916162651462537, + "learning_rate": 6.992796105748455e-07, + "loss": 1.3961, + "step": 309 + }, + { + "epoch": 0.021600529561369892, + "grad_norm": 0.7318959820786545, + "learning_rate": 6.992749421077412e-07, + "loss": 1.6585, + "step": 310 + }, + { + "epoch": 0.021670208688987214, + "grad_norm": 0.7366437888923923, + "learning_rate": 6.992702585799365e-07, + "loss": 1.6567, + "step": 311 + }, + { + "epoch": 0.021739887816604537, + "grad_norm": 0.7091001881005623, + "learning_rate": 6.992655599916557e-07, + "loss": 1.4418, + "step": 312 + }, + { + "epoch": 0.02180956694422186, + "grad_norm": 0.7275485003563671, + "learning_rate": 6.992608463431242e-07, + "loss": 1.5372, + "step": 313 + }, + { + "epoch": 0.021879246071839182, + "grad_norm": 0.6704285285994563, + "learning_rate": 6.992561176345678e-07, + "loss": 1.4605, + "step": 314 + }, + { + "epoch": 0.021948925199456504, + "grad_norm": 0.7125544818193529, + "learning_rate": 6.99251373866213e-07, + "loss": 1.5262, + "step": 315 + }, + { + "epoch": 0.022018604327073824, + "grad_norm": 0.7509609338377493, + "learning_rate": 6.992466150382873e-07, + "loss": 1.5833, + "step": 316 + }, + { + "epoch": 0.022088283454691146, + "grad_norm": 0.747340093745921, + "learning_rate": 6.992418411510185e-07, + "loss": 1.6504, + "step": 317 + }, + { + "epoch": 0.02215796258230847, + "grad_norm": 0.7046299436255133, + "learning_rate": 6.992370522046357e-07, + "loss": 1.4034, + "step": 318 + }, + { + "epoch": 0.02222764170992579, + "grad_norm": 0.7257162097711396, + "learning_rate": 6.992322481993683e-07, + "loss": 1.5848, + "step": 319 + }, + { + "epoch": 0.022297320837543114, + "grad_norm": 0.6691152644528866, + "learning_rate": 6.992274291354463e-07, + "loss": 1.4598, + "step": 320 + }, + { + "epoch": 0.022366999965160436, + "grad_norm": 0.7457614871946824, + "learning_rate": 6.992225950131009e-07, + "loss": 1.5387, + "step": 321 + }, + { + "epoch": 0.02243667909277776, + "grad_norm": 0.689307701631239, + "learning_rate": 6.992177458325635e-07, + "loss": 1.5808, + "step": 322 + }, + { + "epoch": 0.02250635822039508, + "grad_norm": 0.7335991464373063, + "learning_rate": 6.992128815940668e-07, + "loss": 1.6553, + "step": 323 + }, + { + "epoch": 0.022576037348012404, + "grad_norm": 0.6899224219200941, + "learning_rate": 6.992080022978437e-07, + "loss": 1.5265, + "step": 324 + }, + { + "epoch": 0.022645716475629726, + "grad_norm": 0.6806245477363745, + "learning_rate": 6.992031079441282e-07, + "loss": 1.4794, + "step": 325 + }, + { + "epoch": 0.02271539560324705, + "grad_norm": 0.8141536466407525, + "learning_rate": 6.991981985331546e-07, + "loss": 1.5921, + "step": 326 + }, + { + "epoch": 0.022785074730864368, + "grad_norm": 0.6788755156136146, + "learning_rate": 6.991932740651582e-07, + "loss": 1.5735, + "step": 327 + }, + { + "epoch": 0.02285475385848169, + "grad_norm": 0.7272500675836373, + "learning_rate": 6.991883345403751e-07, + "loss": 1.55, + "step": 328 + }, + { + "epoch": 0.022924432986099013, + "grad_norm": 0.7009298523819532, + "learning_rate": 6.991833799590422e-07, + "loss": 1.4261, + "step": 329 + }, + { + "epoch": 0.022994112113716336, + "grad_norm": 0.6948863685694087, + "learning_rate": 6.991784103213965e-07, + "loss": 1.5659, + "step": 330 + }, + { + "epoch": 0.023063791241333658, + "grad_norm": 0.7063327387262183, + "learning_rate": 6.991734256276766e-07, + "loss": 1.5343, + "step": 331 + }, + { + "epoch": 0.02313347036895098, + "grad_norm": 0.7099461836572424, + "learning_rate": 6.99168425878121e-07, + "loss": 1.5125, + "step": 332 + }, + { + "epoch": 0.023203149496568303, + "grad_norm": 0.7796603493076472, + "learning_rate": 6.991634110729694e-07, + "loss": 1.6092, + "step": 333 + }, + { + "epoch": 0.023272828624185626, + "grad_norm": 0.7296219083016224, + "learning_rate": 6.991583812124623e-07, + "loss": 1.6215, + "step": 334 + }, + { + "epoch": 0.023342507751802948, + "grad_norm": 0.72574929831423, + "learning_rate": 6.991533362968406e-07, + "loss": 1.6325, + "step": 335 + }, + { + "epoch": 0.02341218687942027, + "grad_norm": 0.7019432899693439, + "learning_rate": 6.99148276326346e-07, + "loss": 1.5221, + "step": 336 + }, + { + "epoch": 0.023481866007037593, + "grad_norm": 0.7492264986012627, + "learning_rate": 6.991432013012211e-07, + "loss": 1.6158, + "step": 337 + }, + { + "epoch": 0.023551545134654916, + "grad_norm": 0.692588523766954, + "learning_rate": 6.991381112217092e-07, + "loss": 1.5078, + "step": 338 + }, + { + "epoch": 0.023621224262272235, + "grad_norm": 0.6981979538678789, + "learning_rate": 6.991330060880539e-07, + "loss": 1.6101, + "step": 339 + }, + { + "epoch": 0.023690903389889557, + "grad_norm": 0.7194235046945815, + "learning_rate": 6.991278859005003e-07, + "loss": 1.5601, + "step": 340 + }, + { + "epoch": 0.02376058251750688, + "grad_norm": 0.7420959635718415, + "learning_rate": 6.991227506592932e-07, + "loss": 1.6685, + "step": 341 + }, + { + "epoch": 0.023830261645124202, + "grad_norm": 0.7089125015154109, + "learning_rate": 6.991176003646792e-07, + "loss": 1.6257, + "step": 342 + }, + { + "epoch": 0.023899940772741525, + "grad_norm": 0.7210440382167659, + "learning_rate": 6.991124350169049e-07, + "loss": 1.5857, + "step": 343 + }, + { + "epoch": 0.023969619900358848, + "grad_norm": 0.7827481182453416, + "learning_rate": 6.991072546162178e-07, + "loss": 1.6987, + "step": 344 + }, + { + "epoch": 0.02403929902797617, + "grad_norm": 0.7395681382788681, + "learning_rate": 6.991020591628663e-07, + "loss": 1.6111, + "step": 345 + }, + { + "epoch": 0.024108978155593493, + "grad_norm": 0.800887301847492, + "learning_rate": 6.990968486570993e-07, + "loss": 1.5495, + "step": 346 + }, + { + "epoch": 0.024178657283210815, + "grad_norm": 0.7185838722342582, + "learning_rate": 6.990916230991663e-07, + "loss": 1.6471, + "step": 347 + }, + { + "epoch": 0.024248336410828138, + "grad_norm": 0.6988928940290781, + "learning_rate": 6.990863824893181e-07, + "loss": 1.5798, + "step": 348 + }, + { + "epoch": 0.02431801553844546, + "grad_norm": 0.7670273052980163, + "learning_rate": 6.990811268278056e-07, + "loss": 1.6012, + "step": 349 + }, + { + "epoch": 0.02438769466606278, + "grad_norm": 0.6956525844641243, + "learning_rate": 6.990758561148806e-07, + "loss": 1.6228, + "step": 350 + }, + { + "epoch": 0.024457373793680102, + "grad_norm": 0.7380720675968462, + "learning_rate": 6.990705703507959e-07, + "loss": 1.6031, + "step": 351 + }, + { + "epoch": 0.024527052921297424, + "grad_norm": 0.7351909737787244, + "learning_rate": 6.990652695358046e-07, + "loss": 1.5208, + "step": 352 + }, + { + "epoch": 0.024596732048914747, + "grad_norm": 0.740389363327584, + "learning_rate": 6.990599536701608e-07, + "loss": 1.5537, + "step": 353 + }, + { + "epoch": 0.02466641117653207, + "grad_norm": 0.8039919377703592, + "learning_rate": 6.990546227541194e-07, + "loss": 1.7098, + "step": 354 + }, + { + "epoch": 0.024736090304149392, + "grad_norm": 0.7304226595911018, + "learning_rate": 6.990492767879357e-07, + "loss": 1.605, + "step": 355 + }, + { + "epoch": 0.024805769431766714, + "grad_norm": 0.741024497237045, + "learning_rate": 6.990439157718658e-07, + "loss": 1.5043, + "step": 356 + }, + { + "epoch": 0.024875448559384037, + "grad_norm": 0.7024368053127528, + "learning_rate": 6.990385397061669e-07, + "loss": 1.5534, + "step": 357 + }, + { + "epoch": 0.02494512768700136, + "grad_norm": 0.7323425435804326, + "learning_rate": 6.990331485910965e-07, + "loss": 1.6298, + "step": 358 + }, + { + "epoch": 0.025014806814618682, + "grad_norm": 0.7290774888294044, + "learning_rate": 6.990277424269127e-07, + "loss": 1.4802, + "step": 359 + }, + { + "epoch": 0.025084485942236005, + "grad_norm": 0.6658582153329362, + "learning_rate": 6.990223212138749e-07, + "loss": 1.5377, + "step": 360 + }, + { + "epoch": 0.025154165069853327, + "grad_norm": 0.739800561775487, + "learning_rate": 6.990168849522429e-07, + "loss": 1.5655, + "step": 361 + }, + { + "epoch": 0.025223844197470646, + "grad_norm": 0.6911945679125883, + "learning_rate": 6.99011433642277e-07, + "loss": 1.6444, + "step": 362 + }, + { + "epoch": 0.02529352332508797, + "grad_norm": 0.7460442162123894, + "learning_rate": 6.990059672842386e-07, + "loss": 1.4954, + "step": 363 + }, + { + "epoch": 0.02536320245270529, + "grad_norm": 0.8406791490688356, + "learning_rate": 6.990004858783895e-07, + "loss": 1.683, + "step": 364 + }, + { + "epoch": 0.025432881580322614, + "grad_norm": 0.6825887722241889, + "learning_rate": 6.989949894249926e-07, + "loss": 1.4904, + "step": 365 + }, + { + "epoch": 0.025502560707939936, + "grad_norm": 0.734984485601963, + "learning_rate": 6.989894779243111e-07, + "loss": 1.57, + "step": 366 + }, + { + "epoch": 0.02557223983555726, + "grad_norm": 0.802220464323417, + "learning_rate": 6.989839513766093e-07, + "loss": 1.5195, + "step": 367 + }, + { + "epoch": 0.02564191896317458, + "grad_norm": 0.7037489151381637, + "learning_rate": 6.989784097821519e-07, + "loss": 1.4717, + "step": 368 + }, + { + "epoch": 0.025711598090791904, + "grad_norm": 0.7343259525355395, + "learning_rate": 6.989728531412046e-07, + "loss": 1.5571, + "step": 369 + }, + { + "epoch": 0.025781277218409226, + "grad_norm": 0.6747397697253835, + "learning_rate": 6.989672814540335e-07, + "loss": 1.5264, + "step": 370 + }, + { + "epoch": 0.02585095634602655, + "grad_norm": 0.7568976619670307, + "learning_rate": 6.989616947209057e-07, + "loss": 1.6249, + "step": 371 + }, + { + "epoch": 0.02592063547364387, + "grad_norm": 0.6759327486428545, + "learning_rate": 6.98956092942089e-07, + "loss": 1.5019, + "step": 372 + }, + { + "epoch": 0.02599031460126119, + "grad_norm": 0.709291267177882, + "learning_rate": 6.989504761178519e-07, + "loss": 1.5009, + "step": 373 + }, + { + "epoch": 0.026059993728878513, + "grad_norm": 0.7099006099654483, + "learning_rate": 6.989448442484632e-07, + "loss": 1.5475, + "step": 374 + }, + { + "epoch": 0.026129672856495836, + "grad_norm": 0.690112841906305, + "learning_rate": 6.989391973341932e-07, + "loss": 1.4402, + "step": 375 + }, + { + "epoch": 0.026199351984113158, + "grad_norm": 0.7427387751223772, + "learning_rate": 6.989335353753122e-07, + "loss": 1.5491, + "step": 376 + }, + { + "epoch": 0.02626903111173048, + "grad_norm": 0.6770409349734714, + "learning_rate": 6.98927858372092e-07, + "loss": 1.4618, + "step": 377 + }, + { + "epoch": 0.026338710239347803, + "grad_norm": 0.7358590438958797, + "learning_rate": 6.989221663248041e-07, + "loss": 1.5153, + "step": 378 + }, + { + "epoch": 0.026408389366965126, + "grad_norm": 0.6659329097898178, + "learning_rate": 6.989164592337216e-07, + "loss": 1.5076, + "step": 379 + }, + { + "epoch": 0.02647806849458245, + "grad_norm": 0.6891744971335575, + "learning_rate": 6.989107370991179e-07, + "loss": 1.6146, + "step": 380 + }, + { + "epoch": 0.02654774762219977, + "grad_norm": 0.7550006904638933, + "learning_rate": 6.989049999212671e-07, + "loss": 1.5802, + "step": 381 + }, + { + "epoch": 0.026617426749817093, + "grad_norm": 0.7002665501804611, + "learning_rate": 6.988992477004446e-07, + "loss": 1.5355, + "step": 382 + }, + { + "epoch": 0.026687105877434416, + "grad_norm": 0.7009921240154263, + "learning_rate": 6.988934804369254e-07, + "loss": 1.5517, + "step": 383 + }, + { + "epoch": 0.02675678500505174, + "grad_norm": 0.7407286371939378, + "learning_rate": 6.988876981309864e-07, + "loss": 1.616, + "step": 384 + }, + { + "epoch": 0.026826464132669058, + "grad_norm": 0.7137375904069622, + "learning_rate": 6.988819007829045e-07, + "loss": 1.569, + "step": 385 + }, + { + "epoch": 0.02689614326028638, + "grad_norm": 0.6968322471953381, + "learning_rate": 6.988760883929575e-07, + "loss": 1.4872, + "step": 386 + }, + { + "epoch": 0.026965822387903703, + "grad_norm": 0.7335852919010041, + "learning_rate": 6.988702609614239e-07, + "loss": 1.5747, + "step": 387 + }, + { + "epoch": 0.027035501515521025, + "grad_norm": 0.7563187176836982, + "learning_rate": 6.98864418488583e-07, + "loss": 1.6437, + "step": 388 + }, + { + "epoch": 0.027105180643138348, + "grad_norm": 0.7615857240277533, + "learning_rate": 6.988585609747149e-07, + "loss": 1.6221, + "step": 389 + }, + { + "epoch": 0.02717485977075567, + "grad_norm": 0.7603071022901849, + "learning_rate": 6.988526884201002e-07, + "loss": 1.6949, + "step": 390 + }, + { + "epoch": 0.027244538898372993, + "grad_norm": 0.6968767287992647, + "learning_rate": 6.988468008250203e-07, + "loss": 1.6981, + "step": 391 + }, + { + "epoch": 0.027314218025990315, + "grad_norm": 0.7112654333278738, + "learning_rate": 6.988408981897575e-07, + "loss": 1.5906, + "step": 392 + }, + { + "epoch": 0.027383897153607638, + "grad_norm": 0.7298025420013617, + "learning_rate": 6.988349805145946e-07, + "loss": 1.466, + "step": 393 + }, + { + "epoch": 0.02745357628122496, + "grad_norm": 0.710177304695451, + "learning_rate": 6.98829047799815e-07, + "loss": 1.6849, + "step": 394 + }, + { + "epoch": 0.027523255408842283, + "grad_norm": 0.757594638471493, + "learning_rate": 6.988231000457031e-07, + "loss": 1.5379, + "step": 395 + }, + { + "epoch": 0.027592934536459602, + "grad_norm": 0.666881395058542, + "learning_rate": 6.98817137252544e-07, + "loss": 1.3365, + "step": 396 + }, + { + "epoch": 0.027662613664076924, + "grad_norm": 0.7591578570938375, + "learning_rate": 6.988111594206236e-07, + "loss": 1.621, + "step": 397 + }, + { + "epoch": 0.027732292791694247, + "grad_norm": 0.7175961360945237, + "learning_rate": 6.988051665502281e-07, + "loss": 1.6332, + "step": 398 + }, + { + "epoch": 0.02780197191931157, + "grad_norm": 0.6804206351265298, + "learning_rate": 6.987991586416447e-07, + "loss": 1.5439, + "step": 399 + }, + { + "epoch": 0.027871651046928892, + "grad_norm": 0.7399865885593209, + "learning_rate": 6.987931356951616e-07, + "loss": 1.6571, + "step": 400 + }, + { + "epoch": 0.027941330174546215, + "grad_norm": 0.6659744696681877, + "learning_rate": 6.987870977110671e-07, + "loss": 1.4773, + "step": 401 + }, + { + "epoch": 0.028011009302163537, + "grad_norm": 0.7512620715969653, + "learning_rate": 6.987810446896507e-07, + "loss": 1.473, + "step": 402 + }, + { + "epoch": 0.02808068842978086, + "grad_norm": 0.6700099927086145, + "learning_rate": 6.987749766312023e-07, + "loss": 1.4796, + "step": 403 + }, + { + "epoch": 0.028150367557398182, + "grad_norm": 0.7135736477251078, + "learning_rate": 6.98768893536013e-07, + "loss": 1.5236, + "step": 404 + }, + { + "epoch": 0.028220046685015505, + "grad_norm": 0.6892839109346525, + "learning_rate": 6.987627954043743e-07, + "loss": 1.6111, + "step": 405 + }, + { + "epoch": 0.028289725812632827, + "grad_norm": 0.6767335993021136, + "learning_rate": 6.987566822365781e-07, + "loss": 1.5699, + "step": 406 + }, + { + "epoch": 0.028359404940250146, + "grad_norm": 0.7433392074067198, + "learning_rate": 6.987505540329176e-07, + "loss": 1.4527, + "step": 407 + }, + { + "epoch": 0.02842908406786747, + "grad_norm": 0.6556767958241513, + "learning_rate": 6.987444107936865e-07, + "loss": 1.6098, + "step": 408 + }, + { + "epoch": 0.02849876319548479, + "grad_norm": 0.7215818443504534, + "learning_rate": 6.987382525191791e-07, + "loss": 1.4621, + "step": 409 + }, + { + "epoch": 0.028568442323102114, + "grad_norm": 0.7477993104746903, + "learning_rate": 6.987320792096907e-07, + "loss": 1.6124, + "step": 410 + }, + { + "epoch": 0.028638121450719436, + "grad_norm": 0.7201733172736332, + "learning_rate": 6.987258908655168e-07, + "loss": 1.4661, + "step": 411 + }, + { + "epoch": 0.02870780057833676, + "grad_norm": 0.72337434724977, + "learning_rate": 6.987196874869541e-07, + "loss": 1.8324, + "step": 412 + }, + { + "epoch": 0.02877747970595408, + "grad_norm": 0.7097137381375397, + "learning_rate": 6.987134690743e-07, + "loss": 1.5459, + "step": 413 + }, + { + "epoch": 0.028847158833571404, + "grad_norm": 0.7173005147308623, + "learning_rate": 6.987072356278523e-07, + "loss": 1.4824, + "step": 414 + }, + { + "epoch": 0.028916837961188727, + "grad_norm": 0.7879220907659518, + "learning_rate": 6.987009871479101e-07, + "loss": 1.5851, + "step": 415 + }, + { + "epoch": 0.02898651708880605, + "grad_norm": 0.6911412058391415, + "learning_rate": 6.986947236347724e-07, + "loss": 1.4977, + "step": 416 + }, + { + "epoch": 0.02905619621642337, + "grad_norm": 0.7012733331107831, + "learning_rate": 6.986884450887396e-07, + "loss": 1.565, + "step": 417 + }, + { + "epoch": 0.029125875344040694, + "grad_norm": 0.7840924933610383, + "learning_rate": 6.986821515101124e-07, + "loss": 1.55, + "step": 418 + }, + { + "epoch": 0.029195554471658013, + "grad_norm": 0.6964388522029603, + "learning_rate": 6.986758428991927e-07, + "loss": 1.6359, + "step": 419 + }, + { + "epoch": 0.029265233599275336, + "grad_norm": 0.7336109790356905, + "learning_rate": 6.986695192562826e-07, + "loss": 1.5707, + "step": 420 + }, + { + "epoch": 0.02933491272689266, + "grad_norm": 0.7719512610826261, + "learning_rate": 6.986631805816851e-07, + "loss": 1.6261, + "step": 421 + }, + { + "epoch": 0.02940459185450998, + "grad_norm": 0.7301564052254476, + "learning_rate": 6.986568268757041e-07, + "loss": 1.5733, + "step": 422 + }, + { + "epoch": 0.029474270982127303, + "grad_norm": 0.7533907159721789, + "learning_rate": 6.98650458138644e-07, + "loss": 1.4902, + "step": 423 + }, + { + "epoch": 0.029543950109744626, + "grad_norm": 0.7073959569566014, + "learning_rate": 6.986440743708101e-07, + "loss": 1.5164, + "step": 424 + }, + { + "epoch": 0.02961362923736195, + "grad_norm": 0.7145941743912411, + "learning_rate": 6.986376755725082e-07, + "loss": 1.6115, + "step": 425 + }, + { + "epoch": 0.02968330836497927, + "grad_norm": 0.6923721591358505, + "learning_rate": 6.98631261744045e-07, + "loss": 1.5826, + "step": 426 + }, + { + "epoch": 0.029752987492596594, + "grad_norm": 0.6839320727296647, + "learning_rate": 6.986248328857279e-07, + "loss": 1.4239, + "step": 427 + }, + { + "epoch": 0.029822666620213916, + "grad_norm": 0.7210066398143022, + "learning_rate": 6.986183889978649e-07, + "loss": 1.6437, + "step": 428 + }, + { + "epoch": 0.02989234574783124, + "grad_norm": 0.7008115162017041, + "learning_rate": 6.98611930080765e-07, + "loss": 1.5053, + "step": 429 + }, + { + "epoch": 0.029962024875448558, + "grad_norm": 0.6942891542494432, + "learning_rate": 6.986054561347374e-07, + "loss": 1.5146, + "step": 430 + }, + { + "epoch": 0.03003170400306588, + "grad_norm": 0.6635542554591299, + "learning_rate": 6.985989671600925e-07, + "loss": 1.4483, + "step": 431 + }, + { + "epoch": 0.030101383130683203, + "grad_norm": 0.7115892340457141, + "learning_rate": 6.985924631571414e-07, + "loss": 1.5982, + "step": 432 + }, + { + "epoch": 0.030171062258300525, + "grad_norm": 0.6854278325150915, + "learning_rate": 6.985859441261956e-07, + "loss": 1.437, + "step": 433 + }, + { + "epoch": 0.030240741385917848, + "grad_norm": 0.6759620583516114, + "learning_rate": 6.985794100675676e-07, + "loss": 1.5435, + "step": 434 + }, + { + "epoch": 0.03031042051353517, + "grad_norm": 0.6933685069524756, + "learning_rate": 6.985728609815706e-07, + "loss": 1.6306, + "step": 435 + }, + { + "epoch": 0.030380099641152493, + "grad_norm": 0.7146245192057525, + "learning_rate": 6.985662968685184e-07, + "loss": 1.6303, + "step": 436 + }, + { + "epoch": 0.030449778768769815, + "grad_norm": 0.7232446855847888, + "learning_rate": 6.985597177287253e-07, + "loss": 1.5931, + "step": 437 + }, + { + "epoch": 0.030519457896387138, + "grad_norm": 0.7184645069885551, + "learning_rate": 6.985531235625069e-07, + "loss": 1.5508, + "step": 438 + }, + { + "epoch": 0.03058913702400446, + "grad_norm": 0.7214958357639737, + "learning_rate": 6.98546514370179e-07, + "loss": 1.4922, + "step": 439 + }, + { + "epoch": 0.030658816151621783, + "grad_norm": 0.6782468340100151, + "learning_rate": 6.985398901520586e-07, + "loss": 1.5012, + "step": 440 + }, + { + "epoch": 0.030728495279239106, + "grad_norm": 0.7311905689952574, + "learning_rate": 6.985332509084629e-07, + "loss": 1.5831, + "step": 441 + }, + { + "epoch": 0.030798174406856425, + "grad_norm": 0.698328027036214, + "learning_rate": 6.985265966397102e-07, + "loss": 1.4995, + "step": 442 + }, + { + "epoch": 0.030867853534473747, + "grad_norm": 0.7368612541969222, + "learning_rate": 6.985199273461193e-07, + "loss": 1.4867, + "step": 443 + }, + { + "epoch": 0.03093753266209107, + "grad_norm": 0.7327398721151145, + "learning_rate": 6.985132430280098e-07, + "loss": 1.5259, + "step": 444 + }, + { + "epoch": 0.031007211789708392, + "grad_norm": 0.7331724114655763, + "learning_rate": 6.985065436857021e-07, + "loss": 1.5916, + "step": 445 + }, + { + "epoch": 0.031076890917325715, + "grad_norm": 0.7182699009047467, + "learning_rate": 6.984998293195171e-07, + "loss": 1.654, + "step": 446 + }, + { + "epoch": 0.031146570044943037, + "grad_norm": 0.6931250303313554, + "learning_rate": 6.984930999297769e-07, + "loss": 1.4452, + "step": 447 + }, + { + "epoch": 0.03121624917256036, + "grad_norm": 0.7721643708509517, + "learning_rate": 6.984863555168036e-07, + "loss": 1.5864, + "step": 448 + }, + { + "epoch": 0.03128592830017768, + "grad_norm": 0.7545867751828237, + "learning_rate": 6.984795960809205e-07, + "loss": 1.5629, + "step": 449 + }, + { + "epoch": 0.031355607427795, + "grad_norm": 0.7269873867725736, + "learning_rate": 6.984728216224517e-07, + "loss": 1.6225, + "step": 450 + }, + { + "epoch": 0.031425286555412324, + "grad_norm": 0.722602283050896, + "learning_rate": 6.984660321417218e-07, + "loss": 1.4438, + "step": 451 + }, + { + "epoch": 0.031494965683029646, + "grad_norm": 0.7384727981711023, + "learning_rate": 6.98459227639056e-07, + "loss": 1.5485, + "step": 452 + }, + { + "epoch": 0.03156464481064697, + "grad_norm": 0.7018161474292629, + "learning_rate": 6.984524081147805e-07, + "loss": 1.5858, + "step": 453 + }, + { + "epoch": 0.03163432393826429, + "grad_norm": 0.7169485562565262, + "learning_rate": 6.984455735692222e-07, + "loss": 1.6247, + "step": 454 + }, + { + "epoch": 0.031704003065881614, + "grad_norm": 0.7308662225701769, + "learning_rate": 6.984387240027084e-07, + "loss": 1.5516, + "step": 455 + }, + { + "epoch": 0.03177368219349894, + "grad_norm": 0.7209362918239302, + "learning_rate": 6.984318594155676e-07, + "loss": 1.4468, + "step": 456 + }, + { + "epoch": 0.03184336132111626, + "grad_norm": 0.7890065268272298, + "learning_rate": 6.984249798081286e-07, + "loss": 1.5742, + "step": 457 + }, + { + "epoch": 0.03191304044873358, + "grad_norm": 0.7443926307266672, + "learning_rate": 6.98418085180721e-07, + "loss": 1.5981, + "step": 458 + }, + { + "epoch": 0.031982719576350904, + "grad_norm": 0.7686644197473939, + "learning_rate": 6.984111755336755e-07, + "loss": 1.5131, + "step": 459 + }, + { + "epoch": 0.03205239870396823, + "grad_norm": 0.7515630377431824, + "learning_rate": 6.984042508673228e-07, + "loss": 1.5627, + "step": 460 + }, + { + "epoch": 0.03212207783158555, + "grad_norm": 0.8071194750966978, + "learning_rate": 6.983973111819951e-07, + "loss": 1.5605, + "step": 461 + }, + { + "epoch": 0.03219175695920287, + "grad_norm": 0.7068935640984225, + "learning_rate": 6.98390356478025e-07, + "loss": 1.5796, + "step": 462 + }, + { + "epoch": 0.032261436086820194, + "grad_norm": 0.7079434375614534, + "learning_rate": 6.983833867557455e-07, + "loss": 1.544, + "step": 463 + }, + { + "epoch": 0.03233111521443752, + "grad_norm": 0.6875118632785433, + "learning_rate": 6.983764020154909e-07, + "loss": 1.5442, + "step": 464 + }, + { + "epoch": 0.03240079434205484, + "grad_norm": 0.7374325063772078, + "learning_rate": 6.983694022575955e-07, + "loss": 1.4734, + "step": 465 + }, + { + "epoch": 0.03247047346967216, + "grad_norm": 0.68601080519657, + "learning_rate": 6.983623874823952e-07, + "loss": 1.4625, + "step": 466 + }, + { + "epoch": 0.032540152597289485, + "grad_norm": 0.7015382118886788, + "learning_rate": 6.983553576902259e-07, + "loss": 1.4303, + "step": 467 + }, + { + "epoch": 0.03260983172490681, + "grad_norm": 0.7230971366494634, + "learning_rate": 6.983483128814246e-07, + "loss": 1.5003, + "step": 468 + }, + { + "epoch": 0.03267951085252413, + "grad_norm": 0.7052403587790288, + "learning_rate": 6.983412530563287e-07, + "loss": 1.5526, + "step": 469 + }, + { + "epoch": 0.032749189980141445, + "grad_norm": 0.7742076645158189, + "learning_rate": 6.98334178215277e-07, + "loss": 1.516, + "step": 470 + }, + { + "epoch": 0.03281886910775877, + "grad_norm": 0.7818787556440743, + "learning_rate": 6.983270883586081e-07, + "loss": 1.6288, + "step": 471 + }, + { + "epoch": 0.03288854823537609, + "grad_norm": 0.686648879518541, + "learning_rate": 6.983199834866616e-07, + "loss": 1.5191, + "step": 472 + }, + { + "epoch": 0.03295822736299341, + "grad_norm": 0.75638490928715, + "learning_rate": 6.983128635997785e-07, + "loss": 1.6619, + "step": 473 + }, + { + "epoch": 0.033027906490610735, + "grad_norm": 0.6944982743137997, + "learning_rate": 6.983057286982998e-07, + "loss": 1.504, + "step": 474 + }, + { + "epoch": 0.03309758561822806, + "grad_norm": 0.7297694980053806, + "learning_rate": 6.982985787825673e-07, + "loss": 1.7033, + "step": 475 + }, + { + "epoch": 0.03316726474584538, + "grad_norm": 0.7814280477752864, + "learning_rate": 6.982914138529237e-07, + "loss": 1.6998, + "step": 476 + }, + { + "epoch": 0.0332369438734627, + "grad_norm": 0.7592691279743546, + "learning_rate": 6.982842339097124e-07, + "loss": 1.6168, + "step": 477 + }, + { + "epoch": 0.033306623001080025, + "grad_norm": 0.6950822644790459, + "learning_rate": 6.982770389532773e-07, + "loss": 1.6014, + "step": 478 + }, + { + "epoch": 0.03337630212869735, + "grad_norm": 0.8258391482075685, + "learning_rate": 6.982698289839635e-07, + "loss": 1.6586, + "step": 479 + }, + { + "epoch": 0.03344598125631467, + "grad_norm": 0.6971077588869111, + "learning_rate": 6.982626040021164e-07, + "loss": 1.5408, + "step": 480 + }, + { + "epoch": 0.03351566038393199, + "grad_norm": 0.7228631898409523, + "learning_rate": 6.982553640080821e-07, + "loss": 1.5811, + "step": 481 + }, + { + "epoch": 0.033585339511549316, + "grad_norm": 0.7282650250649347, + "learning_rate": 6.982481090022077e-07, + "loss": 1.6308, + "step": 482 + }, + { + "epoch": 0.03365501863916664, + "grad_norm": 0.7363507876647463, + "learning_rate": 6.982408389848408e-07, + "loss": 1.6058, + "step": 483 + }, + { + "epoch": 0.03372469776678396, + "grad_norm": 0.7304548735709389, + "learning_rate": 6.982335539563299e-07, + "loss": 1.5704, + "step": 484 + }, + { + "epoch": 0.03379437689440128, + "grad_norm": 0.7182208438930545, + "learning_rate": 6.98226253917024e-07, + "loss": 1.559, + "step": 485 + }, + { + "epoch": 0.033864056022018606, + "grad_norm": 0.7095183394575739, + "learning_rate": 6.982189388672729e-07, + "loss": 1.5234, + "step": 486 + }, + { + "epoch": 0.03393373514963593, + "grad_norm": 0.7100337849855409, + "learning_rate": 6.982116088074274e-07, + "loss": 1.4691, + "step": 487 + }, + { + "epoch": 0.03400341427725325, + "grad_norm": 0.7187486302555179, + "learning_rate": 6.982042637378384e-07, + "loss": 1.4962, + "step": 488 + }, + { + "epoch": 0.03407309340487057, + "grad_norm": 0.738433028371621, + "learning_rate": 6.981969036588582e-07, + "loss": 1.5137, + "step": 489 + }, + { + "epoch": 0.034142772532487896, + "grad_norm": 0.7494164590053654, + "learning_rate": 6.981895285708394e-07, + "loss": 1.6279, + "step": 490 + }, + { + "epoch": 0.03421245166010522, + "grad_norm": 0.6714113417696089, + "learning_rate": 6.981821384741353e-07, + "loss": 1.5603, + "step": 491 + }, + { + "epoch": 0.03428213078772254, + "grad_norm": 0.7347104034043285, + "learning_rate": 6.981747333691003e-07, + "loss": 1.6593, + "step": 492 + }, + { + "epoch": 0.034351809915339857, + "grad_norm": 0.7315383543287005, + "learning_rate": 6.981673132560891e-07, + "loss": 1.6402, + "step": 493 + }, + { + "epoch": 0.03442148904295718, + "grad_norm": 0.7971154217795332, + "learning_rate": 6.981598781354574e-07, + "loss": 1.8423, + "step": 494 + }, + { + "epoch": 0.0344911681705745, + "grad_norm": 0.7651575324005175, + "learning_rate": 6.981524280075613e-07, + "loss": 1.4978, + "step": 495 + }, + { + "epoch": 0.034560847298191824, + "grad_norm": 0.7325811848983502, + "learning_rate": 6.981449628727581e-07, + "loss": 1.532, + "step": 496 + }, + { + "epoch": 0.03463052642580915, + "grad_norm": 0.7606852959640895, + "learning_rate": 6.981374827314053e-07, + "loss": 1.6605, + "step": 497 + }, + { + "epoch": 0.03470020555342647, + "grad_norm": 0.7965507881830842, + "learning_rate": 6.981299875838615e-07, + "loss": 1.7101, + "step": 498 + }, + { + "epoch": 0.03476988468104379, + "grad_norm": 0.7685993735694805, + "learning_rate": 6.981224774304859e-07, + "loss": 1.5262, + "step": 499 + }, + { + "epoch": 0.034839563808661114, + "grad_norm": 5.064186032538183, + "learning_rate": 6.981149522716382e-07, + "loss": 1.4328, + "step": 500 + }, + { + "epoch": 0.03490924293627844, + "grad_norm": 0.6818074661763434, + "learning_rate": 6.981074121076793e-07, + "loss": 1.5321, + "step": 501 + }, + { + "epoch": 0.03497892206389576, + "grad_norm": 0.7140914098818415, + "learning_rate": 6.980998569389705e-07, + "loss": 1.6084, + "step": 502 + }, + { + "epoch": 0.03504860119151308, + "grad_norm": 0.7070692585114468, + "learning_rate": 6.980922867658736e-07, + "loss": 1.5168, + "step": 503 + }, + { + "epoch": 0.035118280319130404, + "grad_norm": 0.7710204892635041, + "learning_rate": 6.980847015887516e-07, + "loss": 1.5205, + "step": 504 + }, + { + "epoch": 0.03518795944674773, + "grad_norm": 0.6981371493463051, + "learning_rate": 6.98077101407968e-07, + "loss": 1.525, + "step": 505 + }, + { + "epoch": 0.03525763857436505, + "grad_norm": 0.7402087236647792, + "learning_rate": 6.980694862238869e-07, + "loss": 1.5848, + "step": 506 + }, + { + "epoch": 0.03532731770198237, + "grad_norm": 0.7287932998192505, + "learning_rate": 6.980618560368733e-07, + "loss": 1.6185, + "step": 507 + }, + { + "epoch": 0.035396996829599695, + "grad_norm": 0.7641151729360003, + "learning_rate": 6.980542108472929e-07, + "loss": 1.5882, + "step": 508 + }, + { + "epoch": 0.03546667595721702, + "grad_norm": 0.7181253669244724, + "learning_rate": 6.98046550655512e-07, + "loss": 1.6705, + "step": 509 + }, + { + "epoch": 0.03553635508483434, + "grad_norm": 0.710146864055551, + "learning_rate": 6.980388754618978e-07, + "loss": 1.6129, + "step": 510 + }, + { + "epoch": 0.03560603421245166, + "grad_norm": 0.7452987549392524, + "learning_rate": 6.98031185266818e-07, + "loss": 1.5337, + "step": 511 + }, + { + "epoch": 0.035675713340068985, + "grad_norm": 0.7765641316168032, + "learning_rate": 6.980234800706411e-07, + "loss": 1.6418, + "step": 512 + }, + { + "epoch": 0.03574539246768631, + "grad_norm": 0.7175298447436895, + "learning_rate": 6.980157598737365e-07, + "loss": 1.5969, + "step": 513 + }, + { + "epoch": 0.03581507159530363, + "grad_norm": 0.6839168177575335, + "learning_rate": 6.98008024676474e-07, + "loss": 1.5608, + "step": 514 + }, + { + "epoch": 0.03588475072292095, + "grad_norm": 0.7179064031162559, + "learning_rate": 6.980002744792244e-07, + "loss": 1.5678, + "step": 515 + }, + { + "epoch": 0.03595442985053827, + "grad_norm": 0.7319035209158925, + "learning_rate": 6.97992509282359e-07, + "loss": 1.5515, + "step": 516 + }, + { + "epoch": 0.03602410897815559, + "grad_norm": 0.7417659390519656, + "learning_rate": 6.9798472908625e-07, + "loss": 1.5878, + "step": 517 + }, + { + "epoch": 0.03609378810577291, + "grad_norm": 0.7355893605239711, + "learning_rate": 6.979769338912703e-07, + "loss": 1.5392, + "step": 518 + }, + { + "epoch": 0.036163467233390235, + "grad_norm": 0.7046601739665954, + "learning_rate": 6.979691236977935e-07, + "loss": 1.559, + "step": 519 + }, + { + "epoch": 0.03623314636100756, + "grad_norm": 0.7097878162497628, + "learning_rate": 6.979612985061936e-07, + "loss": 1.6167, + "step": 520 + }, + { + "epoch": 0.03630282548862488, + "grad_norm": 0.7173533144523362, + "learning_rate": 6.979534583168458e-07, + "loss": 1.5876, + "step": 521 + }, + { + "epoch": 0.0363725046162422, + "grad_norm": 0.6869507707629723, + "learning_rate": 6.979456031301258e-07, + "loss": 1.5362, + "step": 522 + }, + { + "epoch": 0.036442183743859526, + "grad_norm": 0.7110753403068261, + "learning_rate": 6.9793773294641e-07, + "loss": 1.524, + "step": 523 + }, + { + "epoch": 0.03651186287147685, + "grad_norm": 0.6991914768564813, + "learning_rate": 6.979298477660757e-07, + "loss": 1.5218, + "step": 524 + }, + { + "epoch": 0.03658154199909417, + "grad_norm": 0.7184863855815552, + "learning_rate": 6.979219475895006e-07, + "loss": 1.4594, + "step": 525 + }, + { + "epoch": 0.03665122112671149, + "grad_norm": 0.6675660110790665, + "learning_rate": 6.979140324170635e-07, + "loss": 1.5326, + "step": 526 + }, + { + "epoch": 0.036720900254328816, + "grad_norm": 0.7247217590142752, + "learning_rate": 6.979061022491434e-07, + "loss": 1.5758, + "step": 527 + }, + { + "epoch": 0.03679057938194614, + "grad_norm": 0.7340681651786749, + "learning_rate": 6.978981570861205e-07, + "loss": 1.58, + "step": 528 + }, + { + "epoch": 0.03686025850956346, + "grad_norm": 0.6962545953214849, + "learning_rate": 6.978901969283756e-07, + "loss": 1.6094, + "step": 529 + }, + { + "epoch": 0.03692993763718078, + "grad_norm": 0.7616295983929137, + "learning_rate": 6.978822217762901e-07, + "loss": 1.5415, + "step": 530 + }, + { + "epoch": 0.036999616764798106, + "grad_norm": 0.6858684867940431, + "learning_rate": 6.978742316302462e-07, + "loss": 1.6065, + "step": 531 + }, + { + "epoch": 0.03706929589241543, + "grad_norm": 0.7566143279490591, + "learning_rate": 6.978662264906268e-07, + "loss": 1.6859, + "step": 532 + }, + { + "epoch": 0.03713897502003275, + "grad_norm": 0.7450351080732804, + "learning_rate": 6.978582063578154e-07, + "loss": 1.6706, + "step": 533 + }, + { + "epoch": 0.03720865414765007, + "grad_norm": 0.8200694743403997, + "learning_rate": 6.978501712321967e-07, + "loss": 1.5625, + "step": 534 + }, + { + "epoch": 0.037278333275267396, + "grad_norm": 0.7468795076646643, + "learning_rate": 6.978421211141554e-07, + "loss": 1.6703, + "step": 535 + }, + { + "epoch": 0.03734801240288472, + "grad_norm": 0.7180155939758754, + "learning_rate": 6.978340560040774e-07, + "loss": 1.5986, + "step": 536 + }, + { + "epoch": 0.03741769153050204, + "grad_norm": 0.7147435818894757, + "learning_rate": 6.978259759023493e-07, + "loss": 1.5683, + "step": 537 + }, + { + "epoch": 0.037487370658119364, + "grad_norm": 0.779891568987929, + "learning_rate": 6.978178808093581e-07, + "loss": 1.6255, + "step": 538 + }, + { + "epoch": 0.03755704978573668, + "grad_norm": 0.7618768803406499, + "learning_rate": 6.978097707254919e-07, + "loss": 1.7284, + "step": 539 + }, + { + "epoch": 0.037626728913354, + "grad_norm": 0.7298137167232285, + "learning_rate": 6.978016456511393e-07, + "loss": 1.6203, + "step": 540 + }, + { + "epoch": 0.037696408040971324, + "grad_norm": 0.7562638291509813, + "learning_rate": 6.977935055866896e-07, + "loss": 1.6245, + "step": 541 + }, + { + "epoch": 0.03776608716858865, + "grad_norm": 0.6874011048759046, + "learning_rate": 6.977853505325329e-07, + "loss": 1.4707, + "step": 542 + }, + { + "epoch": 0.03783576629620597, + "grad_norm": 0.7091168800804247, + "learning_rate": 6.977771804890601e-07, + "loss": 1.535, + "step": 543 + }, + { + "epoch": 0.03790544542382329, + "grad_norm": 0.7817069685386216, + "learning_rate": 6.97768995456663e-07, + "loss": 1.5988, + "step": 544 + }, + { + "epoch": 0.037975124551440614, + "grad_norm": 0.7041314844983713, + "learning_rate": 6.977607954357331e-07, + "loss": 1.6012, + "step": 545 + }, + { + "epoch": 0.03804480367905794, + "grad_norm": 0.7442509072892823, + "learning_rate": 6.977525804266641e-07, + "loss": 1.5457, + "step": 546 + }, + { + "epoch": 0.03811448280667526, + "grad_norm": 0.6605248898044932, + "learning_rate": 6.977443504298493e-07, + "loss": 1.5191, + "step": 547 + }, + { + "epoch": 0.03818416193429258, + "grad_norm": 0.7001973481764062, + "learning_rate": 6.977361054456831e-07, + "loss": 1.609, + "step": 548 + }, + { + "epoch": 0.038253841061909905, + "grad_norm": 0.7060005032563952, + "learning_rate": 6.977278454745608e-07, + "loss": 1.5307, + "step": 549 + }, + { + "epoch": 0.03832352018952723, + "grad_norm": 0.6902502138912355, + "learning_rate": 6.97719570516878e-07, + "loss": 1.5973, + "step": 550 + }, + { + "epoch": 0.03839319931714455, + "grad_norm": 0.7691013493985639, + "learning_rate": 6.977112805730315e-07, + "loss": 1.6391, + "step": 551 + }, + { + "epoch": 0.03846287844476187, + "grad_norm": 0.7352744642083536, + "learning_rate": 6.977029756434184e-07, + "loss": 1.6102, + "step": 552 + }, + { + "epoch": 0.038532557572379195, + "grad_norm": 0.7297849741047285, + "learning_rate": 6.976946557284367e-07, + "loss": 1.5189, + "step": 553 + }, + { + "epoch": 0.03860223669999652, + "grad_norm": 0.8050387561532055, + "learning_rate": 6.976863208284852e-07, + "loss": 1.6532, + "step": 554 + }, + { + "epoch": 0.03867191582761384, + "grad_norm": 0.7513047702526604, + "learning_rate": 6.976779709439633e-07, + "loss": 1.655, + "step": 555 + }, + { + "epoch": 0.03874159495523116, + "grad_norm": 0.7109447925987789, + "learning_rate": 6.97669606075271e-07, + "loss": 1.6917, + "step": 556 + }, + { + "epoch": 0.038811274082848485, + "grad_norm": 0.7066517914984015, + "learning_rate": 6.976612262228094e-07, + "loss": 1.548, + "step": 557 + }, + { + "epoch": 0.03888095321046581, + "grad_norm": 0.7926936156146592, + "learning_rate": 6.976528313869799e-07, + "loss": 1.6356, + "step": 558 + }, + { + "epoch": 0.03895063233808313, + "grad_norm": 0.7617651263509975, + "learning_rate": 6.976444215681848e-07, + "loss": 1.4892, + "step": 559 + }, + { + "epoch": 0.03902031146570045, + "grad_norm": 0.7338500441938097, + "learning_rate": 6.976359967668273e-07, + "loss": 1.5784, + "step": 560 + }, + { + "epoch": 0.039089990593317775, + "grad_norm": 0.7305585031017265, + "learning_rate": 6.97627556983311e-07, + "loss": 1.745, + "step": 561 + }, + { + "epoch": 0.03915966972093509, + "grad_norm": 0.712314154316217, + "learning_rate": 6.976191022180402e-07, + "loss": 1.5946, + "step": 562 + }, + { + "epoch": 0.03922934884855241, + "grad_norm": 0.6785678714420585, + "learning_rate": 6.976106324714204e-07, + "loss": 1.5513, + "step": 563 + }, + { + "epoch": 0.039299027976169736, + "grad_norm": 0.7179837146316714, + "learning_rate": 6.976021477438572e-07, + "loss": 1.5586, + "step": 564 + }, + { + "epoch": 0.03936870710378706, + "grad_norm": 0.6880995082747411, + "learning_rate": 6.975936480357574e-07, + "loss": 1.4726, + "step": 565 + }, + { + "epoch": 0.03943838623140438, + "grad_norm": 0.7490245409219959, + "learning_rate": 6.975851333475283e-07, + "loss": 1.5735, + "step": 566 + }, + { + "epoch": 0.0395080653590217, + "grad_norm": 0.7718344714659485, + "learning_rate": 6.975766036795778e-07, + "loss": 1.6744, + "step": 567 + }, + { + "epoch": 0.039577744486639026, + "grad_norm": 0.6999493242955485, + "learning_rate": 6.975680590323147e-07, + "loss": 1.5618, + "step": 568 + }, + { + "epoch": 0.03964742361425635, + "grad_norm": 0.7277339304673994, + "learning_rate": 6.975594994061485e-07, + "loss": 1.6268, + "step": 569 + }, + { + "epoch": 0.03971710274187367, + "grad_norm": 0.7224063955146929, + "learning_rate": 6.975509248014895e-07, + "loss": 1.6151, + "step": 570 + }, + { + "epoch": 0.03978678186949099, + "grad_norm": 0.7754705593906143, + "learning_rate": 6.975423352187485e-07, + "loss": 1.5736, + "step": 571 + }, + { + "epoch": 0.039856460997108316, + "grad_norm": 0.658621938146351, + "learning_rate": 6.975337306583371e-07, + "loss": 1.4863, + "step": 572 + }, + { + "epoch": 0.03992614012472564, + "grad_norm": 0.730264824789615, + "learning_rate": 6.975251111206678e-07, + "loss": 1.5592, + "step": 573 + }, + { + "epoch": 0.03999581925234296, + "grad_norm": 0.6657629477477796, + "learning_rate": 6.975164766061535e-07, + "loss": 1.3176, + "step": 574 + }, + { + "epoch": 0.04006549837996028, + "grad_norm": 0.7795968362382858, + "learning_rate": 6.975078271152082e-07, + "loss": 1.654, + "step": 575 + }, + { + "epoch": 0.040135177507577606, + "grad_norm": 0.7411465230498967, + "learning_rate": 6.974991626482462e-07, + "loss": 1.6269, + "step": 576 + }, + { + "epoch": 0.04020485663519493, + "grad_norm": 0.7291408994652216, + "learning_rate": 6.974904832056828e-07, + "loss": 1.4967, + "step": 577 + }, + { + "epoch": 0.04027453576281225, + "grad_norm": 0.6991182395286692, + "learning_rate": 6.974817887879338e-07, + "loss": 1.4528, + "step": 578 + }, + { + "epoch": 0.040344214890429574, + "grad_norm": 0.7457004375077715, + "learning_rate": 6.974730793954159e-07, + "loss": 1.586, + "step": 579 + }, + { + "epoch": 0.040413894018046896, + "grad_norm": 0.7067310028445386, + "learning_rate": 6.974643550285467e-07, + "loss": 1.638, + "step": 580 + }, + { + "epoch": 0.04048357314566422, + "grad_norm": 0.6841122043735114, + "learning_rate": 6.974556156877441e-07, + "loss": 1.5638, + "step": 581 + }, + { + "epoch": 0.04055325227328154, + "grad_norm": 0.7077425525133038, + "learning_rate": 6.974468613734269e-07, + "loss": 1.4724, + "step": 582 + }, + { + "epoch": 0.040622931400898864, + "grad_norm": 0.7009427894499272, + "learning_rate": 6.974380920860147e-07, + "loss": 1.6103, + "step": 583 + }, + { + "epoch": 0.040692610528516186, + "grad_norm": 0.7608036022452415, + "learning_rate": 6.974293078259277e-07, + "loss": 1.6457, + "step": 584 + }, + { + "epoch": 0.0407622896561335, + "grad_norm": 0.704166945656348, + "learning_rate": 6.974205085935869e-07, + "loss": 1.5455, + "step": 585 + }, + { + "epoch": 0.040831968783750824, + "grad_norm": 0.6946949545881428, + "learning_rate": 6.974116943894139e-07, + "loss": 1.666, + "step": 586 + }, + { + "epoch": 0.04090164791136815, + "grad_norm": 0.716100071019257, + "learning_rate": 6.974028652138311e-07, + "loss": 1.5197, + "step": 587 + }, + { + "epoch": 0.04097132703898547, + "grad_norm": 0.6838107620585757, + "learning_rate": 6.973940210672617e-07, + "loss": 1.4886, + "step": 588 + }, + { + "epoch": 0.04104100616660279, + "grad_norm": 0.6697472310234589, + "learning_rate": 6.973851619501295e-07, + "loss": 1.5473, + "step": 589 + }, + { + "epoch": 0.041110685294220115, + "grad_norm": 0.774401037898998, + "learning_rate": 6.973762878628589e-07, + "loss": 1.5531, + "step": 590 + }, + { + "epoch": 0.04118036442183744, + "grad_norm": 0.6962775521829606, + "learning_rate": 6.973673988058754e-07, + "loss": 1.621, + "step": 591 + }, + { + "epoch": 0.04125004354945476, + "grad_norm": 0.7465537441237275, + "learning_rate": 6.973584947796049e-07, + "loss": 1.3792, + "step": 592 + }, + { + "epoch": 0.04131972267707208, + "grad_norm": 0.7728818872843954, + "learning_rate": 6.973495757844739e-07, + "loss": 1.4883, + "step": 593 + }, + { + "epoch": 0.041389401804689405, + "grad_norm": 0.733610207501176, + "learning_rate": 6.973406418209102e-07, + "loss": 1.647, + "step": 594 + }, + { + "epoch": 0.04145908093230673, + "grad_norm": 0.8035948982021359, + "learning_rate": 6.973316928893416e-07, + "loss": 1.5647, + "step": 595 + }, + { + "epoch": 0.04152876005992405, + "grad_norm": 0.6768018321794376, + "learning_rate": 6.97322728990197e-07, + "loss": 1.4769, + "step": 596 + }, + { + "epoch": 0.04159843918754137, + "grad_norm": 0.7482483315404442, + "learning_rate": 6.973137501239061e-07, + "loss": 1.6266, + "step": 597 + }, + { + "epoch": 0.041668118315158695, + "grad_norm": 0.711191857418, + "learning_rate": 6.973047562908992e-07, + "loss": 1.4338, + "step": 598 + }, + { + "epoch": 0.04173779744277602, + "grad_norm": 0.7248513304507356, + "learning_rate": 6.972957474916072e-07, + "loss": 1.5761, + "step": 599 + }, + { + "epoch": 0.04180747657039334, + "grad_norm": 0.7389307964740751, + "learning_rate": 6.972867237264619e-07, + "loss": 1.5608, + "step": 600 + }, + { + "epoch": 0.04187715569801066, + "grad_norm": 0.7421731119566499, + "learning_rate": 6.972776849958957e-07, + "loss": 1.6273, + "step": 601 + }, + { + "epoch": 0.041946834825627985, + "grad_norm": 0.7836075175702004, + "learning_rate": 6.972686313003416e-07, + "loss": 1.594, + "step": 602 + }, + { + "epoch": 0.04201651395324531, + "grad_norm": 0.7090879903195844, + "learning_rate": 6.972595626402337e-07, + "loss": 1.4985, + "step": 603 + }, + { + "epoch": 0.04208619308086263, + "grad_norm": 0.6658000549796161, + "learning_rate": 6.972504790160064e-07, + "loss": 1.518, + "step": 604 + }, + { + "epoch": 0.04215587220847995, + "grad_norm": 0.7349695726566142, + "learning_rate": 6.972413804280953e-07, + "loss": 1.5596, + "step": 605 + }, + { + "epoch": 0.042225551336097275, + "grad_norm": 0.7483074717752025, + "learning_rate": 6.972322668769361e-07, + "loss": 1.5794, + "step": 606 + }, + { + "epoch": 0.0422952304637146, + "grad_norm": 0.7038158186966939, + "learning_rate": 6.972231383629657e-07, + "loss": 1.5117, + "step": 607 + }, + { + "epoch": 0.04236490959133191, + "grad_norm": 0.6751541816081398, + "learning_rate": 6.972139948866215e-07, + "loss": 1.5732, + "step": 608 + }, + { + "epoch": 0.042434588718949236, + "grad_norm": 0.747301063418828, + "learning_rate": 6.972048364483418e-07, + "loss": 1.5431, + "step": 609 + }, + { + "epoch": 0.04250426784656656, + "grad_norm": 0.7420324379102797, + "learning_rate": 6.971956630485652e-07, + "loss": 1.6458, + "step": 610 + }, + { + "epoch": 0.04257394697418388, + "grad_norm": 0.6698738775177508, + "learning_rate": 6.971864746877316e-07, + "loss": 1.5838, + "step": 611 + }, + { + "epoch": 0.0426436261018012, + "grad_norm": 0.7080826823481463, + "learning_rate": 6.971772713662812e-07, + "loss": 1.5728, + "step": 612 + }, + { + "epoch": 0.042713305229418526, + "grad_norm": 0.7130688284022854, + "learning_rate": 6.971680530846551e-07, + "loss": 1.6254, + "step": 613 + }, + { + "epoch": 0.04278298435703585, + "grad_norm": 0.6664129156822504, + "learning_rate": 6.971588198432952e-07, + "loss": 1.4956, + "step": 614 + }, + { + "epoch": 0.04285266348465317, + "grad_norm": 0.7132385041089505, + "learning_rate": 6.971495716426435e-07, + "loss": 1.5515, + "step": 615 + }, + { + "epoch": 0.04292234261227049, + "grad_norm": 0.73710904278707, + "learning_rate": 6.971403084831436e-07, + "loss": 1.6063, + "step": 616 + }, + { + "epoch": 0.042992021739887816, + "grad_norm": 0.8132005846012044, + "learning_rate": 6.971310303652395e-07, + "loss": 1.6231, + "step": 617 + }, + { + "epoch": 0.04306170086750514, + "grad_norm": 0.6973241807952938, + "learning_rate": 6.971217372893753e-07, + "loss": 1.6154, + "step": 618 + }, + { + "epoch": 0.04313137999512246, + "grad_norm": 2.8190614475050513, + "learning_rate": 6.971124292559969e-07, + "loss": 1.6138, + "step": 619 + }, + { + "epoch": 0.043201059122739784, + "grad_norm": 0.7236203087033882, + "learning_rate": 6.971031062655502e-07, + "loss": 1.46, + "step": 620 + }, + { + "epoch": 0.043270738250357106, + "grad_norm": 0.7721528246703844, + "learning_rate": 6.970937683184816e-07, + "loss": 1.651, + "step": 621 + }, + { + "epoch": 0.04334041737797443, + "grad_norm": 0.7207721016116038, + "learning_rate": 6.970844154152392e-07, + "loss": 1.6102, + "step": 622 + }, + { + "epoch": 0.04341009650559175, + "grad_norm": 0.7470132174851035, + "learning_rate": 6.970750475562709e-07, + "loss": 1.5735, + "step": 623 + }, + { + "epoch": 0.043479775633209074, + "grad_norm": 0.682052163775226, + "learning_rate": 6.970656647420255e-07, + "loss": 1.5302, + "step": 624 + }, + { + "epoch": 0.043549454760826396, + "grad_norm": 0.7618764660275008, + "learning_rate": 6.970562669729528e-07, + "loss": 1.6138, + "step": 625 + }, + { + "epoch": 0.04361913388844372, + "grad_norm": 0.6772064059244419, + "learning_rate": 6.970468542495033e-07, + "loss": 1.4087, + "step": 626 + }, + { + "epoch": 0.04368881301606104, + "grad_norm": 0.7376677084023966, + "learning_rate": 6.970374265721277e-07, + "loss": 1.5959, + "step": 627 + }, + { + "epoch": 0.043758492143678364, + "grad_norm": 0.8303204432724274, + "learning_rate": 6.970279839412782e-07, + "loss": 1.6502, + "step": 628 + }, + { + "epoch": 0.043828171271295686, + "grad_norm": 0.7786738881941985, + "learning_rate": 6.970185263574071e-07, + "loss": 1.6448, + "step": 629 + }, + { + "epoch": 0.04389785039891301, + "grad_norm": 0.7128585750577182, + "learning_rate": 6.970090538209676e-07, + "loss": 1.44, + "step": 630 + }, + { + "epoch": 0.043967529526530325, + "grad_norm": 0.6627090163476095, + "learning_rate": 6.969995663324138e-07, + "loss": 1.3244, + "step": 631 + }, + { + "epoch": 0.04403720865414765, + "grad_norm": 0.7745287325391855, + "learning_rate": 6.969900638922e-07, + "loss": 1.7033, + "step": 632 + }, + { + "epoch": 0.04410688778176497, + "grad_norm": 0.733220507077979, + "learning_rate": 6.969805465007822e-07, + "loss": 1.6772, + "step": 633 + }, + { + "epoch": 0.04417656690938229, + "grad_norm": 0.6960784033598787, + "learning_rate": 6.969710141586159e-07, + "loss": 1.5492, + "step": 634 + }, + { + "epoch": 0.044246246036999615, + "grad_norm": 0.6881665503401039, + "learning_rate": 6.96961466866158e-07, + "loss": 1.5384, + "step": 635 + }, + { + "epoch": 0.04431592516461694, + "grad_norm": 0.7093423611533153, + "learning_rate": 6.969519046238665e-07, + "loss": 1.5778, + "step": 636 + }, + { + "epoch": 0.04438560429223426, + "grad_norm": 0.6950788938005011, + "learning_rate": 6.969423274321992e-07, + "loss": 1.4672, + "step": 637 + }, + { + "epoch": 0.04445528341985158, + "grad_norm": 0.6971933321048744, + "learning_rate": 6.969327352916151e-07, + "loss": 1.5282, + "step": 638 + }, + { + "epoch": 0.044524962547468905, + "grad_norm": 0.6700704096045089, + "learning_rate": 6.96923128202574e-07, + "loss": 1.4895, + "step": 639 + }, + { + "epoch": 0.04459464167508623, + "grad_norm": 0.6942860808996856, + "learning_rate": 6.969135061655361e-07, + "loss": 1.4508, + "step": 640 + }, + { + "epoch": 0.04466432080270355, + "grad_norm": 0.7804476942861223, + "learning_rate": 6.969038691809628e-07, + "loss": 1.509, + "step": 641 + }, + { + "epoch": 0.04473399993032087, + "grad_norm": 0.6853905689405291, + "learning_rate": 6.968942172493156e-07, + "loss": 1.4684, + "step": 642 + }, + { + "epoch": 0.044803679057938195, + "grad_norm": 0.6764348433542908, + "learning_rate": 6.968845503710572e-07, + "loss": 1.5356, + "step": 643 + }, + { + "epoch": 0.04487335818555552, + "grad_norm": 0.6773570660436679, + "learning_rate": 6.96874868546651e-07, + "loss": 1.5618, + "step": 644 + }, + { + "epoch": 0.04494303731317284, + "grad_norm": 0.7622466312475294, + "learning_rate": 6.968651717765608e-07, + "loss": 1.5596, + "step": 645 + }, + { + "epoch": 0.04501271644079016, + "grad_norm": 0.7254213570087428, + "learning_rate": 6.968554600612512e-07, + "loss": 1.5213, + "step": 646 + }, + { + "epoch": 0.045082395568407485, + "grad_norm": 0.6885774067075373, + "learning_rate": 6.968457334011879e-07, + "loss": 1.5261, + "step": 647 + }, + { + "epoch": 0.04515207469602481, + "grad_norm": 0.7101552502470694, + "learning_rate": 6.968359917968368e-07, + "loss": 1.6694, + "step": 648 + }, + { + "epoch": 0.04522175382364213, + "grad_norm": 0.7369871559845563, + "learning_rate": 6.968262352486649e-07, + "loss": 1.5304, + "step": 649 + }, + { + "epoch": 0.04529143295125945, + "grad_norm": 0.7875133371881571, + "learning_rate": 6.968164637571393e-07, + "loss": 1.4934, + "step": 650 + }, + { + "epoch": 0.045361112078876775, + "grad_norm": 0.7019279683983147, + "learning_rate": 6.968066773227289e-07, + "loss": 1.5468, + "step": 651 + }, + { + "epoch": 0.0454307912064941, + "grad_norm": 0.7341904116170085, + "learning_rate": 6.967968759459023e-07, + "loss": 1.5338, + "step": 652 + }, + { + "epoch": 0.04550047033411142, + "grad_norm": 0.6810202405701805, + "learning_rate": 6.967870596271292e-07, + "loss": 1.6357, + "step": 653 + }, + { + "epoch": 0.045570149461728736, + "grad_norm": 0.699448200025997, + "learning_rate": 6.967772283668803e-07, + "loss": 1.5453, + "step": 654 + }, + { + "epoch": 0.04563982858934606, + "grad_norm": 0.7158382501901476, + "learning_rate": 6.967673821656265e-07, + "loss": 1.5505, + "step": 655 + }, + { + "epoch": 0.04570950771696338, + "grad_norm": 0.6535608334622804, + "learning_rate": 6.967575210238395e-07, + "loss": 1.5453, + "step": 656 + }, + { + "epoch": 0.045779186844580703, + "grad_norm": 0.7080634763231599, + "learning_rate": 6.967476449419924e-07, + "loss": 1.5479, + "step": 657 + }, + { + "epoch": 0.045848865972198026, + "grad_norm": 0.7659320955527936, + "learning_rate": 6.967377539205579e-07, + "loss": 1.6749, + "step": 658 + }, + { + "epoch": 0.04591854509981535, + "grad_norm": 0.6686160543798587, + "learning_rate": 6.967278479600104e-07, + "loss": 1.4683, + "step": 659 + }, + { + "epoch": 0.04598822422743267, + "grad_norm": 0.7537119319439501, + "learning_rate": 6.967179270608243e-07, + "loss": 1.45, + "step": 660 + }, + { + "epoch": 0.046057903355049994, + "grad_norm": 0.7349118447650984, + "learning_rate": 6.967079912234754e-07, + "loss": 1.595, + "step": 661 + }, + { + "epoch": 0.046127582482667316, + "grad_norm": 0.7293110394952392, + "learning_rate": 6.966980404484395e-07, + "loss": 1.4752, + "step": 662 + }, + { + "epoch": 0.04619726161028464, + "grad_norm": 0.7387902011995563, + "learning_rate": 6.966880747361936e-07, + "loss": 1.4697, + "step": 663 + }, + { + "epoch": 0.04626694073790196, + "grad_norm": 0.6765249539707564, + "learning_rate": 6.966780940872153e-07, + "loss": 1.4381, + "step": 664 + }, + { + "epoch": 0.046336619865519284, + "grad_norm": 0.689217791498506, + "learning_rate": 6.966680985019828e-07, + "loss": 1.4685, + "step": 665 + }, + { + "epoch": 0.046406298993136606, + "grad_norm": 0.6841331183811533, + "learning_rate": 6.966580879809752e-07, + "loss": 1.4852, + "step": 666 + }, + { + "epoch": 0.04647597812075393, + "grad_norm": 0.743925290927153, + "learning_rate": 6.966480625246722e-07, + "loss": 1.5746, + "step": 667 + }, + { + "epoch": 0.04654565724837125, + "grad_norm": 0.6890299457883376, + "learning_rate": 6.966380221335544e-07, + "loss": 1.6132, + "step": 668 + }, + { + "epoch": 0.046615336375988574, + "grad_norm": 0.7300586622886946, + "learning_rate": 6.966279668081026e-07, + "loss": 1.5986, + "step": 669 + }, + { + "epoch": 0.046685015503605896, + "grad_norm": 0.7367747555470463, + "learning_rate": 6.966178965487989e-07, + "loss": 1.6231, + "step": 670 + }, + { + "epoch": 0.04675469463122322, + "grad_norm": 0.6836238330355955, + "learning_rate": 6.96607811356126e-07, + "loss": 1.576, + "step": 671 + }, + { + "epoch": 0.04682437375884054, + "grad_norm": 0.7033258443880477, + "learning_rate": 6.96597711230567e-07, + "loss": 1.6208, + "step": 672 + }, + { + "epoch": 0.046894052886457864, + "grad_norm": 0.8339698614381776, + "learning_rate": 6.96587596172606e-07, + "loss": 1.5455, + "step": 673 + }, + { + "epoch": 0.04696373201407519, + "grad_norm": 0.7085764029952359, + "learning_rate": 6.965774661827277e-07, + "loss": 1.4264, + "step": 674 + }, + { + "epoch": 0.04703341114169251, + "grad_norm": 0.7646037417894928, + "learning_rate": 6.965673212614174e-07, + "loss": 1.4358, + "step": 675 + }, + { + "epoch": 0.04710309026930983, + "grad_norm": 0.7532185100268795, + "learning_rate": 6.965571614091616e-07, + "loss": 1.6341, + "step": 676 + }, + { + "epoch": 0.04717276939692715, + "grad_norm": 0.6685706703379944, + "learning_rate": 6.96546986626447e-07, + "loss": 1.546, + "step": 677 + }, + { + "epoch": 0.04724244852454447, + "grad_norm": 0.680236955536486, + "learning_rate": 6.965367969137611e-07, + "loss": 1.4986, + "step": 678 + }, + { + "epoch": 0.04731212765216179, + "grad_norm": 0.6986156128855948, + "learning_rate": 6.965265922715925e-07, + "loss": 1.5194, + "step": 679 + }, + { + "epoch": 0.047381806779779115, + "grad_norm": 0.7329077948281687, + "learning_rate": 6.9651637270043e-07, + "loss": 1.5835, + "step": 680 + }, + { + "epoch": 0.04745148590739644, + "grad_norm": 0.67737004024086, + "learning_rate": 6.965061382007632e-07, + "loss": 1.5043, + "step": 681 + }, + { + "epoch": 0.04752116503501376, + "grad_norm": 0.7124756190212052, + "learning_rate": 6.964958887730829e-07, + "loss": 1.5287, + "step": 682 + }, + { + "epoch": 0.04759084416263108, + "grad_norm": 0.7089711249978385, + "learning_rate": 6.964856244178801e-07, + "loss": 1.5851, + "step": 683 + }, + { + "epoch": 0.047660523290248405, + "grad_norm": 0.6927438151246575, + "learning_rate": 6.964753451356467e-07, + "loss": 1.5052, + "step": 684 + }, + { + "epoch": 0.04773020241786573, + "grad_norm": 0.7749472284337433, + "learning_rate": 6.964650509268753e-07, + "loss": 1.6473, + "step": 685 + }, + { + "epoch": 0.04779988154548305, + "grad_norm": 0.6963609256556942, + "learning_rate": 6.964547417920593e-07, + "loss": 1.6013, + "step": 686 + }, + { + "epoch": 0.04786956067310037, + "grad_norm": 0.687383111340697, + "learning_rate": 6.964444177316926e-07, + "loss": 1.5295, + "step": 687 + }, + { + "epoch": 0.047939239800717695, + "grad_norm": 0.7180160100897451, + "learning_rate": 6.964340787462701e-07, + "loss": 1.4711, + "step": 688 + }, + { + "epoch": 0.04800891892833502, + "grad_norm": 0.7590607641255273, + "learning_rate": 6.964237248362871e-07, + "loss": 1.5769, + "step": 689 + }, + { + "epoch": 0.04807859805595234, + "grad_norm": 0.8004088368747236, + "learning_rate": 6.9641335600224e-07, + "loss": 1.5545, + "step": 690 + }, + { + "epoch": 0.04814827718356966, + "grad_norm": 0.7273413263118548, + "learning_rate": 6.964029722446253e-07, + "loss": 1.6128, + "step": 691 + }, + { + "epoch": 0.048217956311186985, + "grad_norm": 0.7066776812890259, + "learning_rate": 6.963925735639411e-07, + "loss": 1.5317, + "step": 692 + }, + { + "epoch": 0.04828763543880431, + "grad_norm": 0.7015857397206559, + "learning_rate": 6.963821599606854e-07, + "loss": 1.5972, + "step": 693 + }, + { + "epoch": 0.04835731456642163, + "grad_norm": 0.7232812425018523, + "learning_rate": 6.963717314353574e-07, + "loss": 1.5294, + "step": 694 + }, + { + "epoch": 0.04842699369403895, + "grad_norm": 0.7243849048748213, + "learning_rate": 6.963612879884567e-07, + "loss": 1.556, + "step": 695 + }, + { + "epoch": 0.048496672821656275, + "grad_norm": 0.7359746952905802, + "learning_rate": 6.96350829620484e-07, + "loss": 1.546, + "step": 696 + }, + { + "epoch": 0.0485663519492736, + "grad_norm": 0.7122154316120477, + "learning_rate": 6.963403563319402e-07, + "loss": 1.733, + "step": 697 + }, + { + "epoch": 0.04863603107689092, + "grad_norm": 0.7055779006893984, + "learning_rate": 6.963298681233274e-07, + "loss": 1.6171, + "step": 698 + }, + { + "epoch": 0.04870571020450824, + "grad_norm": 0.716296197133196, + "learning_rate": 6.963193649951483e-07, + "loss": 1.752, + "step": 699 + }, + { + "epoch": 0.04877538933212556, + "grad_norm": 0.6713758234296247, + "learning_rate": 6.96308846947906e-07, + "loss": 1.4199, + "step": 700 + }, + { + "epoch": 0.04884506845974288, + "grad_norm": 0.7743784620060852, + "learning_rate": 6.962983139821047e-07, + "loss": 1.4771, + "step": 701 + }, + { + "epoch": 0.048914747587360204, + "grad_norm": 0.6641119216158364, + "learning_rate": 6.96287766098249e-07, + "loss": 1.5956, + "step": 702 + }, + { + "epoch": 0.048984426714977526, + "grad_norm": 0.6558222609838982, + "learning_rate": 6.962772032968446e-07, + "loss": 1.4149, + "step": 703 + }, + { + "epoch": 0.04905410584259485, + "grad_norm": 0.7450866187835455, + "learning_rate": 6.962666255783975e-07, + "loss": 1.5214, + "step": 704 + }, + { + "epoch": 0.04912378497021217, + "grad_norm": 0.7446602054101235, + "learning_rate": 6.962560329434148e-07, + "loss": 1.5909, + "step": 705 + }, + { + "epoch": 0.049193464097829494, + "grad_norm": 0.6912223067403324, + "learning_rate": 6.962454253924038e-07, + "loss": 1.5803, + "step": 706 + }, + { + "epoch": 0.049263143225446816, + "grad_norm": 0.7497854047755567, + "learning_rate": 6.962348029258732e-07, + "loss": 1.6816, + "step": 707 + }, + { + "epoch": 0.04933282235306414, + "grad_norm": 0.6944629046945672, + "learning_rate": 6.96224165544332e-07, + "loss": 1.4664, + "step": 708 + }, + { + "epoch": 0.04940250148068146, + "grad_norm": 0.6968358758121764, + "learning_rate": 6.962135132482896e-07, + "loss": 1.5491, + "step": 709 + }, + { + "epoch": 0.049472180608298784, + "grad_norm": 0.6771949433189489, + "learning_rate": 6.962028460382568e-07, + "loss": 1.4434, + "step": 710 + }, + { + "epoch": 0.049541859735916106, + "grad_norm": 0.7158393842253569, + "learning_rate": 6.961921639147448e-07, + "loss": 1.6615, + "step": 711 + }, + { + "epoch": 0.04961153886353343, + "grad_norm": 0.6926218608736993, + "learning_rate": 6.961814668782655e-07, + "loss": 1.551, + "step": 712 + }, + { + "epoch": 0.04968121799115075, + "grad_norm": 0.7354027473157468, + "learning_rate": 6.961707549293313e-07, + "loss": 1.5184, + "step": 713 + }, + { + "epoch": 0.049750897118768074, + "grad_norm": 0.6540498769668062, + "learning_rate": 6.961600280684558e-07, + "loss": 1.4642, + "step": 714 + }, + { + "epoch": 0.0498205762463854, + "grad_norm": 0.7328051651446971, + "learning_rate": 6.961492862961528e-07, + "loss": 1.5514, + "step": 715 + }, + { + "epoch": 0.04989025537400272, + "grad_norm": 0.7123550388724823, + "learning_rate": 6.961385296129375e-07, + "loss": 1.6047, + "step": 716 + }, + { + "epoch": 0.04995993450162004, + "grad_norm": 0.7147651353416852, + "learning_rate": 6.961277580193249e-07, + "loss": 1.6656, + "step": 717 + }, + { + "epoch": 0.050029613629237364, + "grad_norm": 0.6906496045277001, + "learning_rate": 6.961169715158317e-07, + "loss": 1.5934, + "step": 718 + }, + { + "epoch": 0.05009929275685469, + "grad_norm": 0.7541044654754127, + "learning_rate": 6.961061701029741e-07, + "loss": 1.4913, + "step": 719 + }, + { + "epoch": 0.05016897188447201, + "grad_norm": 0.7540942860771993, + "learning_rate": 6.960953537812703e-07, + "loss": 1.5373, + "step": 720 + }, + { + "epoch": 0.05023865101208933, + "grad_norm": 0.6781331883086441, + "learning_rate": 6.960845225512386e-07, + "loss": 1.5436, + "step": 721 + }, + { + "epoch": 0.050308330139706654, + "grad_norm": 0.8470452786448093, + "learning_rate": 6.960736764133978e-07, + "loss": 1.6208, + "step": 722 + }, + { + "epoch": 0.05037800926732397, + "grad_norm": 0.6760410770524181, + "learning_rate": 6.960628153682679e-07, + "loss": 1.4621, + "step": 723 + }, + { + "epoch": 0.05044768839494129, + "grad_norm": 0.7490727821128771, + "learning_rate": 6.960519394163692e-07, + "loss": 1.4257, + "step": 724 + }, + { + "epoch": 0.050517367522558615, + "grad_norm": 0.6857827107854004, + "learning_rate": 6.96041048558223e-07, + "loss": 1.6527, + "step": 725 + }, + { + "epoch": 0.05058704665017594, + "grad_norm": 0.7032073154704966, + "learning_rate": 6.960301427943513e-07, + "loss": 1.6024, + "step": 726 + }, + { + "epoch": 0.05065672577779326, + "grad_norm": 0.6849058234749557, + "learning_rate": 6.960192221252765e-07, + "loss": 1.4722, + "step": 727 + }, + { + "epoch": 0.05072640490541058, + "grad_norm": 0.7322091820332296, + "learning_rate": 6.960082865515221e-07, + "loss": 1.6492, + "step": 728 + }, + { + "epoch": 0.050796084033027905, + "grad_norm": 0.7224516190286429, + "learning_rate": 6.959973360736122e-07, + "loss": 1.6839, + "step": 729 + }, + { + "epoch": 0.05086576316064523, + "grad_norm": 0.686954801078055, + "learning_rate": 6.959863706920713e-07, + "loss": 1.5376, + "step": 730 + }, + { + "epoch": 0.05093544228826255, + "grad_norm": 0.6862780679407999, + "learning_rate": 6.959753904074253e-07, + "loss": 1.5023, + "step": 731 + }, + { + "epoch": 0.05100512141587987, + "grad_norm": 0.721010317984146, + "learning_rate": 6.959643952202001e-07, + "loss": 1.5141, + "step": 732 + }, + { + "epoch": 0.051074800543497195, + "grad_norm": 0.6967847500688505, + "learning_rate": 6.959533851309226e-07, + "loss": 1.4556, + "step": 733 + }, + { + "epoch": 0.05114447967111452, + "grad_norm": 0.7489626901360203, + "learning_rate": 6.959423601401205e-07, + "loss": 1.533, + "step": 734 + }, + { + "epoch": 0.05121415879873184, + "grad_norm": 0.7850799405099177, + "learning_rate": 6.959313202483222e-07, + "loss": 1.5717, + "step": 735 + }, + { + "epoch": 0.05128383792634916, + "grad_norm": 0.6962064304868032, + "learning_rate": 6.959202654560567e-07, + "loss": 1.4927, + "step": 736 + }, + { + "epoch": 0.051353517053966485, + "grad_norm": 0.6814157744096807, + "learning_rate": 6.959091957638539e-07, + "loss": 1.5426, + "step": 737 + }, + { + "epoch": 0.05142319618158381, + "grad_norm": 0.7046197816079532, + "learning_rate": 6.958981111722439e-07, + "loss": 1.5478, + "step": 738 + }, + { + "epoch": 0.05149287530920113, + "grad_norm": 0.730215604464463, + "learning_rate": 6.958870116817583e-07, + "loss": 1.5133, + "step": 739 + }, + { + "epoch": 0.05156255443681845, + "grad_norm": 0.7009966531083134, + "learning_rate": 6.958758972929289e-07, + "loss": 1.5814, + "step": 740 + }, + { + "epoch": 0.051632233564435775, + "grad_norm": 0.7164763075684121, + "learning_rate": 6.958647680062882e-07, + "loss": 1.54, + "step": 741 + }, + { + "epoch": 0.0517019126920531, + "grad_norm": 0.6961036208661276, + "learning_rate": 6.958536238223697e-07, + "loss": 1.5408, + "step": 742 + }, + { + "epoch": 0.05177159181967042, + "grad_norm": 0.7736476120671665, + "learning_rate": 6.958424647417074e-07, + "loss": 1.633, + "step": 743 + }, + { + "epoch": 0.05184127094728774, + "grad_norm": 0.7260767598944713, + "learning_rate": 6.958312907648358e-07, + "loss": 1.4184, + "step": 744 + }, + { + "epoch": 0.051910950074905066, + "grad_norm": 0.7292566831371331, + "learning_rate": 6.958201018922908e-07, + "loss": 1.4852, + "step": 745 + }, + { + "epoch": 0.05198062920252238, + "grad_norm": 0.6826555733917109, + "learning_rate": 6.958088981246085e-07, + "loss": 1.4976, + "step": 746 + }, + { + "epoch": 0.052050308330139704, + "grad_norm": 0.8057138550101121, + "learning_rate": 6.957976794623257e-07, + "loss": 1.6662, + "step": 747 + }, + { + "epoch": 0.052119987457757026, + "grad_norm": 0.7383595107784787, + "learning_rate": 6.9578644590598e-07, + "loss": 1.6895, + "step": 748 + }, + { + "epoch": 0.05218966658537435, + "grad_norm": 0.7623169165234596, + "learning_rate": 6.957751974561098e-07, + "loss": 1.5831, + "step": 749 + }, + { + "epoch": 0.05225934571299167, + "grad_norm": 0.7010872976902116, + "learning_rate": 6.95763934113254e-07, + "loss": 1.6176, + "step": 750 + }, + { + "epoch": 0.052329024840608994, + "grad_norm": 0.7652887108312698, + "learning_rate": 6.957526558779526e-07, + "loss": 1.5928, + "step": 751 + }, + { + "epoch": 0.052398703968226316, + "grad_norm": 0.8296288937742872, + "learning_rate": 6.95741362750746e-07, + "loss": 1.5899, + "step": 752 + }, + { + "epoch": 0.05246838309584364, + "grad_norm": 0.7128647360486586, + "learning_rate": 6.957300547321753e-07, + "loss": 1.55, + "step": 753 + }, + { + "epoch": 0.05253806222346096, + "grad_norm": 0.7111814164131637, + "learning_rate": 6.957187318227823e-07, + "loss": 1.7715, + "step": 754 + }, + { + "epoch": 0.052607741351078284, + "grad_norm": 0.7581371563133963, + "learning_rate": 6.9570739402311e-07, + "loss": 1.621, + "step": 755 + }, + { + "epoch": 0.05267742047869561, + "grad_norm": 0.7320911081083655, + "learning_rate": 6.956960413337015e-07, + "loss": 1.6028, + "step": 756 + }, + { + "epoch": 0.05274709960631293, + "grad_norm": 0.6811468195966736, + "learning_rate": 6.956846737551008e-07, + "loss": 1.4258, + "step": 757 + }, + { + "epoch": 0.05281677873393025, + "grad_norm": 0.7558283632455667, + "learning_rate": 6.956732912878528e-07, + "loss": 1.6908, + "step": 758 + }, + { + "epoch": 0.052886457861547574, + "grad_norm": 0.7401585909526207, + "learning_rate": 6.956618939325027e-07, + "loss": 1.5859, + "step": 759 + }, + { + "epoch": 0.0529561369891649, + "grad_norm": 0.6869688342909643, + "learning_rate": 6.95650481689597e-07, + "loss": 1.5995, + "step": 760 + }, + { + "epoch": 0.05302581611678222, + "grad_norm": 0.732390242062926, + "learning_rate": 6.956390545596824e-07, + "loss": 1.5717, + "step": 761 + }, + { + "epoch": 0.05309549524439954, + "grad_norm": 0.7217401642738522, + "learning_rate": 6.956276125433066e-07, + "loss": 1.6592, + "step": 762 + }, + { + "epoch": 0.053165174372016864, + "grad_norm": 0.7026097108747728, + "learning_rate": 6.956161556410179e-07, + "loss": 1.4727, + "step": 763 + }, + { + "epoch": 0.05323485349963419, + "grad_norm": 0.7104570430746965, + "learning_rate": 6.956046838533654e-07, + "loss": 1.548, + "step": 764 + }, + { + "epoch": 0.05330453262725151, + "grad_norm": 0.6794024019476335, + "learning_rate": 6.955931971808987e-07, + "loss": 1.3615, + "step": 765 + }, + { + "epoch": 0.05337421175486883, + "grad_norm": 0.7494295020279464, + "learning_rate": 6.955816956241684e-07, + "loss": 1.5749, + "step": 766 + }, + { + "epoch": 0.053443890882486154, + "grad_norm": 0.6688709082207913, + "learning_rate": 6.955701791837256e-07, + "loss": 1.5078, + "step": 767 + }, + { + "epoch": 0.05351357001010348, + "grad_norm": 0.710494094903225, + "learning_rate": 6.955586478601222e-07, + "loss": 1.4936, + "step": 768 + }, + { + "epoch": 0.05358324913772079, + "grad_norm": 0.711894193384377, + "learning_rate": 6.955471016539109e-07, + "loss": 1.563, + "step": 769 + }, + { + "epoch": 0.053652928265338115, + "grad_norm": 0.7323768349826295, + "learning_rate": 6.95535540565645e-07, + "loss": 1.5585, + "step": 770 + }, + { + "epoch": 0.05372260739295544, + "grad_norm": 0.7329434747907027, + "learning_rate": 6.955239645958784e-07, + "loss": 1.617, + "step": 771 + }, + { + "epoch": 0.05379228652057276, + "grad_norm": 0.748227642697152, + "learning_rate": 6.95512373745166e-07, + "loss": 1.5454, + "step": 772 + }, + { + "epoch": 0.05386196564819008, + "grad_norm": 0.7328425554339898, + "learning_rate": 6.95500768014063e-07, + "loss": 1.5224, + "step": 773 + }, + { + "epoch": 0.053931644775807405, + "grad_norm": 0.7466917176783684, + "learning_rate": 6.954891474031259e-07, + "loss": 1.5347, + "step": 774 + }, + { + "epoch": 0.05400132390342473, + "grad_norm": 0.7585007520137389, + "learning_rate": 6.954775119129114e-07, + "loss": 1.7438, + "step": 775 + }, + { + "epoch": 0.05407100303104205, + "grad_norm": 0.7439885599579423, + "learning_rate": 6.95465861543977e-07, + "loss": 1.6233, + "step": 776 + }, + { + "epoch": 0.05414068215865937, + "grad_norm": 0.6937689954958782, + "learning_rate": 6.954541962968813e-07, + "loss": 1.5569, + "step": 777 + }, + { + "epoch": 0.054210361286276695, + "grad_norm": 0.6808608820879647, + "learning_rate": 6.954425161721832e-07, + "loss": 1.5482, + "step": 778 + }, + { + "epoch": 0.05428004041389402, + "grad_norm": 0.7549169966464018, + "learning_rate": 6.954308211704422e-07, + "loss": 1.5517, + "step": 779 + }, + { + "epoch": 0.05434971954151134, + "grad_norm": 0.6734510329316952, + "learning_rate": 6.954191112922192e-07, + "loss": 1.5492, + "step": 780 + }, + { + "epoch": 0.05441939866912866, + "grad_norm": 0.6877325247570651, + "learning_rate": 6.954073865380749e-07, + "loss": 1.4862, + "step": 781 + }, + { + "epoch": 0.054489077796745986, + "grad_norm": 0.6788147898970631, + "learning_rate": 6.953956469085715e-07, + "loss": 1.5726, + "step": 782 + }, + { + "epoch": 0.05455875692436331, + "grad_norm": 0.7305927316424901, + "learning_rate": 6.953838924042714e-07, + "loss": 1.4975, + "step": 783 + }, + { + "epoch": 0.05462843605198063, + "grad_norm": 0.7132269277569465, + "learning_rate": 6.95372123025738e-07, + "loss": 1.4406, + "step": 784 + }, + { + "epoch": 0.05469811517959795, + "grad_norm": 0.7020412658784635, + "learning_rate": 6.953603387735353e-07, + "loss": 1.587, + "step": 785 + }, + { + "epoch": 0.054767794307215276, + "grad_norm": 0.7528096373864551, + "learning_rate": 6.953485396482281e-07, + "loss": 1.601, + "step": 786 + }, + { + "epoch": 0.0548374734348326, + "grad_norm": 0.6512215101888548, + "learning_rate": 6.953367256503816e-07, + "loss": 1.5033, + "step": 787 + }, + { + "epoch": 0.05490715256244992, + "grad_norm": 0.6788130061609902, + "learning_rate": 6.953248967805621e-07, + "loss": 1.4305, + "step": 788 + }, + { + "epoch": 0.05497683169006724, + "grad_norm": 0.7446892987199926, + "learning_rate": 6.953130530393365e-07, + "loss": 1.7707, + "step": 789 + }, + { + "epoch": 0.055046510817684566, + "grad_norm": 0.8206689971928751, + "learning_rate": 6.953011944272724e-07, + "loss": 1.697, + "step": 790 + }, + { + "epoch": 0.05511618994530188, + "grad_norm": 0.7121618215215683, + "learning_rate": 6.952893209449378e-07, + "loss": 1.5926, + "step": 791 + }, + { + "epoch": 0.055185869072919204, + "grad_norm": 0.6684685055821509, + "learning_rate": 6.952774325929022e-07, + "loss": 1.5346, + "step": 792 + }, + { + "epoch": 0.055255548200536526, + "grad_norm": 0.7876441568581036, + "learning_rate": 6.95265529371735e-07, + "loss": 1.6429, + "step": 793 + }, + { + "epoch": 0.05532522732815385, + "grad_norm": 0.6966735167203888, + "learning_rate": 6.952536112820066e-07, + "loss": 1.5847, + "step": 794 + }, + { + "epoch": 0.05539490645577117, + "grad_norm": 0.6584961268883045, + "learning_rate": 6.952416783242882e-07, + "loss": 1.4795, + "step": 795 + }, + { + "epoch": 0.055464585583388494, + "grad_norm": 0.8031283060641858, + "learning_rate": 6.952297304991516e-07, + "loss": 1.7066, + "step": 796 + }, + { + "epoch": 0.05553426471100582, + "grad_norm": 0.7839285381689913, + "learning_rate": 6.952177678071696e-07, + "loss": 1.7255, + "step": 797 + }, + { + "epoch": 0.05560394383862314, + "grad_norm": 0.7693006460738586, + "learning_rate": 6.952057902489152e-07, + "loss": 1.6596, + "step": 798 + }, + { + "epoch": 0.05567362296624046, + "grad_norm": 0.722826042731386, + "learning_rate": 6.951937978249624e-07, + "loss": 1.5677, + "step": 799 + }, + { + "epoch": 0.055743302093857784, + "grad_norm": 0.7117854955696307, + "learning_rate": 6.951817905358861e-07, + "loss": 1.5712, + "step": 800 + }, + { + "epoch": 0.05581298122147511, + "grad_norm": 0.7873681567228429, + "learning_rate": 6.951697683822617e-07, + "loss": 1.5217, + "step": 801 + }, + { + "epoch": 0.05588266034909243, + "grad_norm": 0.6990444943511518, + "learning_rate": 6.951577313646651e-07, + "loss": 1.4714, + "step": 802 + }, + { + "epoch": 0.05595233947670975, + "grad_norm": 0.7087438008306041, + "learning_rate": 6.951456794836733e-07, + "loss": 1.5843, + "step": 803 + }, + { + "epoch": 0.056022018604327074, + "grad_norm": 0.7466712931823629, + "learning_rate": 6.951336127398638e-07, + "loss": 1.5316, + "step": 804 + }, + { + "epoch": 0.0560916977319444, + "grad_norm": 0.7653458202685157, + "learning_rate": 6.951215311338148e-07, + "loss": 1.6453, + "step": 805 + }, + { + "epoch": 0.05616137685956172, + "grad_norm": 0.7236559613144168, + "learning_rate": 6.951094346661055e-07, + "loss": 1.6199, + "step": 806 + }, + { + "epoch": 0.05623105598717904, + "grad_norm": 0.6944459703744715, + "learning_rate": 6.950973233373155e-07, + "loss": 1.5263, + "step": 807 + }, + { + "epoch": 0.056300735114796364, + "grad_norm": 0.7816944407503731, + "learning_rate": 6.95085197148025e-07, + "loss": 1.6126, + "step": 808 + }, + { + "epoch": 0.05637041424241369, + "grad_norm": 0.7124399093435471, + "learning_rate": 6.950730560988153e-07, + "loss": 1.5162, + "step": 809 + }, + { + "epoch": 0.05644009337003101, + "grad_norm": 0.6988292198617666, + "learning_rate": 6.950609001902682e-07, + "loss": 1.4888, + "step": 810 + }, + { + "epoch": 0.05650977249764833, + "grad_norm": 0.7329883842362056, + "learning_rate": 6.950487294229662e-07, + "loss": 1.6594, + "step": 811 + }, + { + "epoch": 0.056579451625265655, + "grad_norm": 0.6939338590714141, + "learning_rate": 6.950365437974927e-07, + "loss": 1.3898, + "step": 812 + }, + { + "epoch": 0.05664913075288298, + "grad_norm": 0.7188599302209264, + "learning_rate": 6.950243433144314e-07, + "loss": 1.4821, + "step": 813 + }, + { + "epoch": 0.05671880988050029, + "grad_norm": 0.6823465353383991, + "learning_rate": 6.950121279743672e-07, + "loss": 1.5173, + "step": 814 + }, + { + "epoch": 0.056788489008117615, + "grad_norm": 0.8750761366460855, + "learning_rate": 6.949998977778852e-07, + "loss": 1.7818, + "step": 815 + }, + { + "epoch": 0.05685816813573494, + "grad_norm": 0.7137120708518332, + "learning_rate": 6.949876527255718e-07, + "loss": 1.6433, + "step": 816 + }, + { + "epoch": 0.05692784726335226, + "grad_norm": 0.6619871776167325, + "learning_rate": 6.949753928180137e-07, + "loss": 1.5437, + "step": 817 + }, + { + "epoch": 0.05699752639096958, + "grad_norm": 0.698883982747292, + "learning_rate": 6.949631180557984e-07, + "loss": 1.5743, + "step": 818 + }, + { + "epoch": 0.057067205518586905, + "grad_norm": 0.7606502997566646, + "learning_rate": 6.949508284395141e-07, + "loss": 1.6545, + "step": 819 + }, + { + "epoch": 0.05713688464620423, + "grad_norm": 0.714118076308743, + "learning_rate": 6.949385239697498e-07, + "loss": 1.4312, + "step": 820 + }, + { + "epoch": 0.05720656377382155, + "grad_norm": 0.7478782433732383, + "learning_rate": 6.949262046470951e-07, + "loss": 1.5869, + "step": 821 + }, + { + "epoch": 0.05727624290143887, + "grad_norm": 0.7041434815718658, + "learning_rate": 6.949138704721405e-07, + "loss": 1.5026, + "step": 822 + }, + { + "epoch": 0.057345922029056196, + "grad_norm": 0.7290786553345412, + "learning_rate": 6.94901521445477e-07, + "loss": 1.8279, + "step": 823 + }, + { + "epoch": 0.05741560115667352, + "grad_norm": 0.6990004793808002, + "learning_rate": 6.948891575676963e-07, + "loss": 1.4113, + "step": 824 + }, + { + "epoch": 0.05748528028429084, + "grad_norm": 0.7811171957869557, + "learning_rate": 6.94876778839391e-07, + "loss": 1.558, + "step": 825 + }, + { + "epoch": 0.05755495941190816, + "grad_norm": 0.739030141597572, + "learning_rate": 6.948643852611543e-07, + "loss": 1.5298, + "step": 826 + }, + { + "epoch": 0.057624638539525486, + "grad_norm": 0.7219462250643431, + "learning_rate": 6.948519768335801e-07, + "loss": 1.5291, + "step": 827 + }, + { + "epoch": 0.05769431766714281, + "grad_norm": 0.7120923451377806, + "learning_rate": 6.948395535572631e-07, + "loss": 1.4506, + "step": 828 + }, + { + "epoch": 0.05776399679476013, + "grad_norm": 0.6975168518442334, + "learning_rate": 6.948271154327985e-07, + "loss": 1.4987, + "step": 829 + }, + { + "epoch": 0.05783367592237745, + "grad_norm": 0.7138373985437595, + "learning_rate": 6.948146624607826e-07, + "loss": 1.6882, + "step": 830 + }, + { + "epoch": 0.057903355049994776, + "grad_norm": 0.6708575706578195, + "learning_rate": 6.948021946418118e-07, + "loss": 1.4863, + "step": 831 + }, + { + "epoch": 0.0579730341776121, + "grad_norm": 0.6808372768351861, + "learning_rate": 6.947897119764841e-07, + "loss": 1.5298, + "step": 832 + }, + { + "epoch": 0.05804271330522942, + "grad_norm": 0.6852255133243262, + "learning_rate": 6.947772144653973e-07, + "loss": 1.5228, + "step": 833 + }, + { + "epoch": 0.05811239243284674, + "grad_norm": 0.7146018666757779, + "learning_rate": 6.947647021091504e-07, + "loss": 1.5371, + "step": 834 + }, + { + "epoch": 0.058182071560464066, + "grad_norm": 0.7270596691308416, + "learning_rate": 6.947521749083431e-07, + "loss": 1.6466, + "step": 835 + }, + { + "epoch": 0.05825175068808139, + "grad_norm": 0.7603407007056545, + "learning_rate": 6.947396328635757e-07, + "loss": 1.6199, + "step": 836 + }, + { + "epoch": 0.058321429815698704, + "grad_norm": 0.6738987574655745, + "learning_rate": 6.947270759754491e-07, + "loss": 1.5234, + "step": 837 + }, + { + "epoch": 0.05839110894331603, + "grad_norm": 0.6897399123926644, + "learning_rate": 6.947145042445652e-07, + "loss": 1.4501, + "step": 838 + }, + { + "epoch": 0.05846078807093335, + "grad_norm": 0.7264844681997801, + "learning_rate": 6.947019176715265e-07, + "loss": 1.5785, + "step": 839 + }, + { + "epoch": 0.05853046719855067, + "grad_norm": 0.71216329594823, + "learning_rate": 6.94689316256936e-07, + "loss": 1.5699, + "step": 840 + }, + { + "epoch": 0.058600146326167994, + "grad_norm": 0.7269442447044326, + "learning_rate": 6.946767000013978e-07, + "loss": 1.5217, + "step": 841 + }, + { + "epoch": 0.05866982545378532, + "grad_norm": 0.7213513641430525, + "learning_rate": 6.946640689055163e-07, + "loss": 1.4736, + "step": 842 + }, + { + "epoch": 0.05873950458140264, + "grad_norm": 0.6911765142958883, + "learning_rate": 6.946514229698968e-07, + "loss": 1.5468, + "step": 843 + }, + { + "epoch": 0.05880918370901996, + "grad_norm": 0.7312402973375173, + "learning_rate": 6.946387621951456e-07, + "loss": 1.5845, + "step": 844 + }, + { + "epoch": 0.058878862836637284, + "grad_norm": 0.8766300017445524, + "learning_rate": 6.946260865818691e-07, + "loss": 1.6933, + "step": 845 + }, + { + "epoch": 0.05894854196425461, + "grad_norm": 0.7437506934142784, + "learning_rate": 6.946133961306748e-07, + "loss": 1.5705, + "step": 846 + }, + { + "epoch": 0.05901822109187193, + "grad_norm": 0.7250232099606774, + "learning_rate": 6.946006908421711e-07, + "loss": 1.6298, + "step": 847 + }, + { + "epoch": 0.05908790021948925, + "grad_norm": 0.7468248406429453, + "learning_rate": 6.945879707169668e-07, + "loss": 1.5957, + "step": 848 + }, + { + "epoch": 0.059157579347106574, + "grad_norm": 0.7307135773660111, + "learning_rate": 6.945752357556712e-07, + "loss": 1.6783, + "step": 849 + }, + { + "epoch": 0.0592272584747239, + "grad_norm": 0.7462954093323562, + "learning_rate": 6.945624859588947e-07, + "loss": 1.6313, + "step": 850 + }, + { + "epoch": 0.05929693760234122, + "grad_norm": 0.6836820982793157, + "learning_rate": 6.945497213272485e-07, + "loss": 1.5552, + "step": 851 + }, + { + "epoch": 0.05936661672995854, + "grad_norm": 0.7349495347711605, + "learning_rate": 6.94536941861344e-07, + "loss": 1.5534, + "step": 852 + }, + { + "epoch": 0.059436295857575865, + "grad_norm": 0.7393307515575156, + "learning_rate": 6.945241475617939e-07, + "loss": 1.5919, + "step": 853 + }, + { + "epoch": 0.05950597498519319, + "grad_norm": 0.7581161323825397, + "learning_rate": 6.945113384292112e-07, + "loss": 1.5157, + "step": 854 + }, + { + "epoch": 0.05957565411281051, + "grad_norm": 0.7811029541452695, + "learning_rate": 6.944985144642097e-07, + "loss": 1.5304, + "step": 855 + }, + { + "epoch": 0.05964533324042783, + "grad_norm": 0.6479169819808708, + "learning_rate": 6.94485675667404e-07, + "loss": 1.5384, + "step": 856 + }, + { + "epoch": 0.059715012368045155, + "grad_norm": 0.6754137449735433, + "learning_rate": 6.944728220394094e-07, + "loss": 1.5514, + "step": 857 + }, + { + "epoch": 0.05978469149566248, + "grad_norm": 0.7010515414317577, + "learning_rate": 6.944599535808418e-07, + "loss": 1.6216, + "step": 858 + }, + { + "epoch": 0.0598543706232798, + "grad_norm": 0.7420195687781036, + "learning_rate": 6.944470702923181e-07, + "loss": 1.4931, + "step": 859 + }, + { + "epoch": 0.059924049750897115, + "grad_norm": 0.6985723055107013, + "learning_rate": 6.944341721744553e-07, + "loss": 1.5086, + "step": 860 + }, + { + "epoch": 0.05999372887851444, + "grad_norm": 0.7496262938898509, + "learning_rate": 6.944212592278718e-07, + "loss": 1.5999, + "step": 861 + }, + { + "epoch": 0.06006340800613176, + "grad_norm": 0.6981277967509739, + "learning_rate": 6.944083314531863e-07, + "loss": 1.6053, + "step": 862 + }, + { + "epoch": 0.06013308713374908, + "grad_norm": 0.6804099181252625, + "learning_rate": 6.943953888510182e-07, + "loss": 1.5734, + "step": 863 + }, + { + "epoch": 0.060202766261366406, + "grad_norm": 0.6731032371257547, + "learning_rate": 6.943824314219881e-07, + "loss": 1.5698, + "step": 864 + }, + { + "epoch": 0.06027244538898373, + "grad_norm": 0.7425209366343755, + "learning_rate": 6.943694591667166e-07, + "loss": 1.5785, + "step": 865 + }, + { + "epoch": 0.06034212451660105, + "grad_norm": 0.7528653762418056, + "learning_rate": 6.943564720858257e-07, + "loss": 1.6388, + "step": 866 + }, + { + "epoch": 0.06041180364421837, + "grad_norm": 0.8122728565845222, + "learning_rate": 6.943434701799373e-07, + "loss": 1.625, + "step": 867 + }, + { + "epoch": 0.060481482771835696, + "grad_norm": 0.7019858720037662, + "learning_rate": 6.943304534496749e-07, + "loss": 1.564, + "step": 868 + }, + { + "epoch": 0.06055116189945302, + "grad_norm": 0.7177275391866323, + "learning_rate": 6.943174218956621e-07, + "loss": 1.6123, + "step": 869 + }, + { + "epoch": 0.06062084102707034, + "grad_norm": 0.7030350752558858, + "learning_rate": 6.943043755185235e-07, + "loss": 1.5799, + "step": 870 + }, + { + "epoch": 0.06069052015468766, + "grad_norm": 0.7320624667675142, + "learning_rate": 6.942913143188841e-07, + "loss": 1.626, + "step": 871 + }, + { + "epoch": 0.060760199282304986, + "grad_norm": 0.7741178657105833, + "learning_rate": 6.9427823829737e-07, + "loss": 1.6211, + "step": 872 + }, + { + "epoch": 0.06082987840992231, + "grad_norm": 0.7034164705948763, + "learning_rate": 6.942651474546077e-07, + "loss": 1.5162, + "step": 873 + }, + { + "epoch": 0.06089955753753963, + "grad_norm": 0.6909381638542067, + "learning_rate": 6.942520417912248e-07, + "loss": 1.4292, + "step": 874 + }, + { + "epoch": 0.06096923666515695, + "grad_norm": 0.7115351412753147, + "learning_rate": 6.94238921307849e-07, + "loss": 1.6299, + "step": 875 + }, + { + "epoch": 0.061038915792774276, + "grad_norm": 0.6704973088007256, + "learning_rate": 6.942257860051093e-07, + "loss": 1.6049, + "step": 876 + }, + { + "epoch": 0.0611085949203916, + "grad_norm": 0.7460113523362771, + "learning_rate": 6.942126358836352e-07, + "loss": 1.6023, + "step": 877 + }, + { + "epoch": 0.06117827404800892, + "grad_norm": 0.740997449770298, + "learning_rate": 6.941994709440567e-07, + "loss": 1.461, + "step": 878 + }, + { + "epoch": 0.061247953175626244, + "grad_norm": 0.7344001816527201, + "learning_rate": 6.941862911870047e-07, + "loss": 1.6357, + "step": 879 + }, + { + "epoch": 0.061317632303243566, + "grad_norm": 0.842515805713784, + "learning_rate": 6.941730966131111e-07, + "loss": 1.6079, + "step": 880 + }, + { + "epoch": 0.06138731143086089, + "grad_norm": 0.7541684280459339, + "learning_rate": 6.941598872230078e-07, + "loss": 1.4737, + "step": 881 + }, + { + "epoch": 0.06145699055847821, + "grad_norm": 0.7780595951607582, + "learning_rate": 6.941466630173281e-07, + "loss": 1.5857, + "step": 882 + }, + { + "epoch": 0.06152666968609553, + "grad_norm": 0.7922696893069073, + "learning_rate": 6.941334239967056e-07, + "loss": 1.6614, + "step": 883 + }, + { + "epoch": 0.06159634881371285, + "grad_norm": 0.7049532795712117, + "learning_rate": 6.941201701617749e-07, + "loss": 1.5411, + "step": 884 + }, + { + "epoch": 0.06166602794133017, + "grad_norm": 0.7030659598960232, + "learning_rate": 6.941069015131709e-07, + "loss": 1.6289, + "step": 885 + }, + { + "epoch": 0.061735707068947494, + "grad_norm": 0.7093629644917849, + "learning_rate": 6.940936180515296e-07, + "loss": 1.4986, + "step": 886 + }, + { + "epoch": 0.06180538619656482, + "grad_norm": 0.7310013628107739, + "learning_rate": 6.940803197774875e-07, + "loss": 1.7331, + "step": 887 + }, + { + "epoch": 0.06187506532418214, + "grad_norm": 0.7085823814611059, + "learning_rate": 6.94067006691682e-07, + "loss": 1.4859, + "step": 888 + }, + { + "epoch": 0.06194474445179946, + "grad_norm": 0.6963722191996505, + "learning_rate": 6.940536787947512e-07, + "loss": 1.6608, + "step": 889 + }, + { + "epoch": 0.062014423579416784, + "grad_norm": 0.7409922933496068, + "learning_rate": 6.940403360873335e-07, + "loss": 1.5919, + "step": 890 + }, + { + "epoch": 0.06208410270703411, + "grad_norm": 0.6655268228790038, + "learning_rate": 6.940269785700685e-07, + "loss": 1.507, + "step": 891 + }, + { + "epoch": 0.06215378183465143, + "grad_norm": 0.6821234570236667, + "learning_rate": 6.940136062435963e-07, + "loss": 1.6311, + "step": 892 + }, + { + "epoch": 0.06222346096226875, + "grad_norm": 0.6902129271155977, + "learning_rate": 6.940002191085575e-07, + "loss": 1.533, + "step": 893 + }, + { + "epoch": 0.062293140089886075, + "grad_norm": 0.7606556884733682, + "learning_rate": 6.93986817165594e-07, + "loss": 1.6512, + "step": 894 + }, + { + "epoch": 0.0623628192175034, + "grad_norm": 0.7289684165848465, + "learning_rate": 6.939734004153479e-07, + "loss": 1.5418, + "step": 895 + }, + { + "epoch": 0.06243249834512072, + "grad_norm": 0.6811870960828804, + "learning_rate": 6.939599688584621e-07, + "loss": 1.5511, + "step": 896 + }, + { + "epoch": 0.06250217747273804, + "grad_norm": 0.7242835339866917, + "learning_rate": 6.939465224955802e-07, + "loss": 1.4901, + "step": 897 + }, + { + "epoch": 0.06257185660035536, + "grad_norm": 0.7231645951362123, + "learning_rate": 6.939330613273468e-07, + "loss": 1.5142, + "step": 898 + }, + { + "epoch": 0.06264153572797268, + "grad_norm": 0.7434077826308495, + "learning_rate": 6.939195853544069e-07, + "loss": 1.5778, + "step": 899 + }, + { + "epoch": 0.06271121485559, + "grad_norm": 0.7528424466845036, + "learning_rate": 6.939060945774062e-07, + "loss": 1.5573, + "step": 900 + }, + { + "epoch": 0.06278089398320733, + "grad_norm": 0.6960916907060996, + "learning_rate": 6.938925889969913e-07, + "loss": 1.5107, + "step": 901 + }, + { + "epoch": 0.06285057311082465, + "grad_norm": 0.741434891143874, + "learning_rate": 6.938790686138093e-07, + "loss": 1.4976, + "step": 902 + }, + { + "epoch": 0.06292025223844197, + "grad_norm": 0.7758276506408398, + "learning_rate": 6.938655334285084e-07, + "loss": 1.7011, + "step": 903 + }, + { + "epoch": 0.06298993136605929, + "grad_norm": 0.7923740403380025, + "learning_rate": 6.938519834417369e-07, + "loss": 1.6621, + "step": 904 + }, + { + "epoch": 0.06305961049367662, + "grad_norm": 0.6856901148311233, + "learning_rate": 6.938384186541444e-07, + "loss": 1.6103, + "step": 905 + }, + { + "epoch": 0.06312928962129394, + "grad_norm": 0.702310601086839, + "learning_rate": 6.938248390663807e-07, + "loss": 1.5056, + "step": 906 + }, + { + "epoch": 0.06319896874891126, + "grad_norm": 0.7810829024837994, + "learning_rate": 6.938112446790969e-07, + "loss": 1.5446, + "step": 907 + }, + { + "epoch": 0.06326864787652858, + "grad_norm": 0.7307945024026805, + "learning_rate": 6.937976354929442e-07, + "loss": 1.6072, + "step": 908 + }, + { + "epoch": 0.0633383270041459, + "grad_norm": 0.7100956099681816, + "learning_rate": 6.937840115085747e-07, + "loss": 1.5953, + "step": 909 + }, + { + "epoch": 0.06340800613176323, + "grad_norm": 0.7112718468289873, + "learning_rate": 6.937703727266416e-07, + "loss": 1.5293, + "step": 910 + }, + { + "epoch": 0.06347768525938055, + "grad_norm": 0.6940826349677348, + "learning_rate": 6.937567191477984e-07, + "loss": 1.52, + "step": 911 + }, + { + "epoch": 0.06354736438699787, + "grad_norm": 0.7615667933385247, + "learning_rate": 6.937430507726993e-07, + "loss": 1.5318, + "step": 912 + }, + { + "epoch": 0.0636170435146152, + "grad_norm": 0.7576711542574699, + "learning_rate": 6.937293676019993e-07, + "loss": 1.6737, + "step": 913 + }, + { + "epoch": 0.06368672264223252, + "grad_norm": 0.7429248277684565, + "learning_rate": 6.937156696363543e-07, + "loss": 1.6404, + "step": 914 + }, + { + "epoch": 0.06375640176984984, + "grad_norm": 0.696155637296429, + "learning_rate": 6.937019568764206e-07, + "loss": 1.5436, + "step": 915 + }, + { + "epoch": 0.06382608089746716, + "grad_norm": 0.6842590034765481, + "learning_rate": 6.936882293228554e-07, + "loss": 1.5392, + "step": 916 + }, + { + "epoch": 0.06389576002508449, + "grad_norm": 0.696787602226188, + "learning_rate": 6.936744869763163e-07, + "loss": 1.5912, + "step": 917 + }, + { + "epoch": 0.06396543915270181, + "grad_norm": 0.7528902720982941, + "learning_rate": 6.936607298374624e-07, + "loss": 1.6004, + "step": 918 + }, + { + "epoch": 0.06403511828031913, + "grad_norm": 0.6598951020939933, + "learning_rate": 6.936469579069525e-07, + "loss": 1.5006, + "step": 919 + }, + { + "epoch": 0.06410479740793645, + "grad_norm": 0.6731494738244983, + "learning_rate": 6.936331711854467e-07, + "loss": 1.4561, + "step": 920 + }, + { + "epoch": 0.06417447653555378, + "grad_norm": 0.6743817096436955, + "learning_rate": 6.936193696736058e-07, + "loss": 1.5704, + "step": 921 + }, + { + "epoch": 0.0642441556631711, + "grad_norm": 0.7051720087157666, + "learning_rate": 6.936055533720911e-07, + "loss": 1.4952, + "step": 922 + }, + { + "epoch": 0.06431383479078842, + "grad_norm": 0.7103238647333436, + "learning_rate": 6.935917222815648e-07, + "loss": 1.5239, + "step": 923 + }, + { + "epoch": 0.06438351391840574, + "grad_norm": 0.7235088782382487, + "learning_rate": 6.935778764026895e-07, + "loss": 1.6262, + "step": 924 + }, + { + "epoch": 0.06445319304602307, + "grad_norm": 0.6819119127698352, + "learning_rate": 6.935640157361289e-07, + "loss": 1.5787, + "step": 925 + }, + { + "epoch": 0.06452287217364039, + "grad_norm": 0.7374359593151948, + "learning_rate": 6.935501402825473e-07, + "loss": 1.4686, + "step": 926 + }, + { + "epoch": 0.06459255130125771, + "grad_norm": 0.7116900330828307, + "learning_rate": 6.935362500426095e-07, + "loss": 1.5711, + "step": 927 + }, + { + "epoch": 0.06466223042887503, + "grad_norm": 0.6973009589376816, + "learning_rate": 6.935223450169812e-07, + "loss": 1.6242, + "step": 928 + }, + { + "epoch": 0.06473190955649236, + "grad_norm": 0.721322375011435, + "learning_rate": 6.935084252063286e-07, + "loss": 1.5486, + "step": 929 + }, + { + "epoch": 0.06480158868410968, + "grad_norm": 0.6756510474295269, + "learning_rate": 6.934944906113191e-07, + "loss": 1.4674, + "step": 930 + }, + { + "epoch": 0.064871267811727, + "grad_norm": 0.7197319416399377, + "learning_rate": 6.934805412326201e-07, + "loss": 1.5464, + "step": 931 + }, + { + "epoch": 0.06494094693934432, + "grad_norm": 0.7561165970179851, + "learning_rate": 6.934665770709004e-07, + "loss": 1.4934, + "step": 932 + }, + { + "epoch": 0.06501062606696165, + "grad_norm": 0.7184946003658444, + "learning_rate": 6.93452598126829e-07, + "loss": 1.39, + "step": 933 + }, + { + "epoch": 0.06508030519457897, + "grad_norm": 0.7180786135022353, + "learning_rate": 6.934386044010759e-07, + "loss": 1.6113, + "step": 934 + }, + { + "epoch": 0.06514998432219629, + "grad_norm": 0.7796809708168528, + "learning_rate": 6.934245958943115e-07, + "loss": 1.6264, + "step": 935 + }, + { + "epoch": 0.06521966344981361, + "grad_norm": 0.725538535253128, + "learning_rate": 6.934105726072076e-07, + "loss": 1.5312, + "step": 936 + }, + { + "epoch": 0.06528934257743094, + "grad_norm": 0.7304382528288118, + "learning_rate": 6.933965345404356e-07, + "loss": 1.5362, + "step": 937 + }, + { + "epoch": 0.06535902170504826, + "grad_norm": 0.700294818967333, + "learning_rate": 6.933824816946687e-07, + "loss": 1.4986, + "step": 938 + }, + { + "epoch": 0.06542870083266558, + "grad_norm": 0.6997315562329343, + "learning_rate": 6.933684140705801e-07, + "loss": 1.4747, + "step": 939 + }, + { + "epoch": 0.06549837996028289, + "grad_norm": 0.7567625481144566, + "learning_rate": 6.933543316688441e-07, + "loss": 1.5696, + "step": 940 + }, + { + "epoch": 0.06556805908790021, + "grad_norm": 41.44909914705615, + "learning_rate": 6.933402344901354e-07, + "loss": 1.6127, + "step": 941 + }, + { + "epoch": 0.06563773821551754, + "grad_norm": 0.6679798181768342, + "learning_rate": 6.933261225351298e-07, + "loss": 1.585, + "step": 942 + }, + { + "epoch": 0.06570741734313486, + "grad_norm": 0.7000648730835969, + "learning_rate": 6.933119958045033e-07, + "loss": 1.6216, + "step": 943 + }, + { + "epoch": 0.06577709647075218, + "grad_norm": 0.7596264387177684, + "learning_rate": 6.93297854298933e-07, + "loss": 1.6523, + "step": 944 + }, + { + "epoch": 0.0658467755983695, + "grad_norm": 0.7193619041313796, + "learning_rate": 6.932836980190967e-07, + "loss": 1.5485, + "step": 945 + }, + { + "epoch": 0.06591645472598683, + "grad_norm": 0.6980772094913987, + "learning_rate": 6.932695269656726e-07, + "loss": 1.5319, + "step": 946 + }, + { + "epoch": 0.06598613385360415, + "grad_norm": 0.7442746561075959, + "learning_rate": 6.9325534113934e-07, + "loss": 1.5161, + "step": 947 + }, + { + "epoch": 0.06605581298122147, + "grad_norm": 0.6777835337823753, + "learning_rate": 6.932411405407785e-07, + "loss": 1.4636, + "step": 948 + }, + { + "epoch": 0.0661254921088388, + "grad_norm": 0.6727156937868579, + "learning_rate": 6.932269251706688e-07, + "loss": 1.5201, + "step": 949 + }, + { + "epoch": 0.06619517123645612, + "grad_norm": 0.7245330036989707, + "learning_rate": 6.932126950296921e-07, + "loss": 1.5595, + "step": 950 + }, + { + "epoch": 0.06626485036407344, + "grad_norm": 0.7125701635864038, + "learning_rate": 6.931984501185303e-07, + "loss": 1.6323, + "step": 951 + }, + { + "epoch": 0.06633452949169076, + "grad_norm": 0.6963101718771975, + "learning_rate": 6.93184190437866e-07, + "loss": 1.513, + "step": 952 + }, + { + "epoch": 0.06640420861930808, + "grad_norm": 0.7046379018841911, + "learning_rate": 6.931699159883825e-07, + "loss": 1.5284, + "step": 953 + }, + { + "epoch": 0.0664738877469254, + "grad_norm": 0.678439137321281, + "learning_rate": 6.931556267707642e-07, + "loss": 1.553, + "step": 954 + }, + { + "epoch": 0.06654356687454273, + "grad_norm": 0.7410253132305197, + "learning_rate": 6.931413227856954e-07, + "loss": 1.5533, + "step": 955 + }, + { + "epoch": 0.06661324600216005, + "grad_norm": 0.6908842566143348, + "learning_rate": 6.93127004033862e-07, + "loss": 1.6341, + "step": 956 + }, + { + "epoch": 0.06668292512977737, + "grad_norm": 0.6651535227538429, + "learning_rate": 6.931126705159499e-07, + "loss": 1.5806, + "step": 957 + }, + { + "epoch": 0.0667526042573947, + "grad_norm": 0.6920497807137774, + "learning_rate": 6.930983222326462e-07, + "loss": 1.567, + "step": 958 + }, + { + "epoch": 0.06682228338501202, + "grad_norm": 0.7078371963892504, + "learning_rate": 6.930839591846383e-07, + "loss": 1.6668, + "step": 959 + }, + { + "epoch": 0.06689196251262934, + "grad_norm": 0.7606381115394008, + "learning_rate": 6.930695813726146e-07, + "loss": 1.4752, + "step": 960 + }, + { + "epoch": 0.06696164164024666, + "grad_norm": 0.8536717759212689, + "learning_rate": 6.93055188797264e-07, + "loss": 1.5652, + "step": 961 + }, + { + "epoch": 0.06703132076786399, + "grad_norm": 0.6653573860861989, + "learning_rate": 6.930407814592765e-07, + "loss": 1.5914, + "step": 962 + }, + { + "epoch": 0.06710099989548131, + "grad_norm": 0.784194439397854, + "learning_rate": 6.930263593593424e-07, + "loss": 1.5406, + "step": 963 + }, + { + "epoch": 0.06717067902309863, + "grad_norm": 0.7610703370050599, + "learning_rate": 6.930119224981526e-07, + "loss": 1.6075, + "step": 964 + }, + { + "epoch": 0.06724035815071595, + "grad_norm": 0.6764485214707169, + "learning_rate": 6.929974708763992e-07, + "loss": 1.504, + "step": 965 + }, + { + "epoch": 0.06731003727833328, + "grad_norm": 0.707512885228743, + "learning_rate": 6.929830044947746e-07, + "loss": 1.4592, + "step": 966 + }, + { + "epoch": 0.0673797164059506, + "grad_norm": 0.7081374537556824, + "learning_rate": 6.929685233539723e-07, + "loss": 1.6566, + "step": 967 + }, + { + "epoch": 0.06744939553356792, + "grad_norm": 0.7122482694793018, + "learning_rate": 6.929540274546861e-07, + "loss": 1.6348, + "step": 968 + }, + { + "epoch": 0.06751907466118524, + "grad_norm": 0.6901522900821034, + "learning_rate": 6.929395167976105e-07, + "loss": 1.5057, + "step": 969 + }, + { + "epoch": 0.06758875378880257, + "grad_norm": 0.7160908180373641, + "learning_rate": 6.929249913834413e-07, + "loss": 1.5376, + "step": 970 + }, + { + "epoch": 0.06765843291641989, + "grad_norm": 0.7388489184952017, + "learning_rate": 6.929104512128743e-07, + "loss": 1.4687, + "step": 971 + }, + { + "epoch": 0.06772811204403721, + "grad_norm": 0.7003909631678097, + "learning_rate": 6.928958962866063e-07, + "loss": 1.6329, + "step": 972 + }, + { + "epoch": 0.06779779117165453, + "grad_norm": 0.6912891435110838, + "learning_rate": 6.928813266053349e-07, + "loss": 1.4846, + "step": 973 + }, + { + "epoch": 0.06786747029927186, + "grad_norm": 0.6603982078975053, + "learning_rate": 6.928667421697582e-07, + "loss": 1.3881, + "step": 974 + }, + { + "epoch": 0.06793714942688918, + "grad_norm": 0.7235369345078301, + "learning_rate": 6.928521429805752e-07, + "loss": 1.6745, + "step": 975 + }, + { + "epoch": 0.0680068285545065, + "grad_norm": 0.7482907027017797, + "learning_rate": 6.928375290384856e-07, + "loss": 1.5114, + "step": 976 + }, + { + "epoch": 0.06807650768212382, + "grad_norm": 0.786232207346211, + "learning_rate": 6.928229003441894e-07, + "loss": 1.5673, + "step": 977 + }, + { + "epoch": 0.06814618680974115, + "grad_norm": 0.6986408216965335, + "learning_rate": 6.928082568983882e-07, + "loss": 1.6475, + "step": 978 + }, + { + "epoch": 0.06821586593735847, + "grad_norm": 0.7219767354753639, + "learning_rate": 6.927935987017831e-07, + "loss": 1.4191, + "step": 979 + }, + { + "epoch": 0.06828554506497579, + "grad_norm": 0.7733751013809093, + "learning_rate": 6.927789257550769e-07, + "loss": 1.4901, + "step": 980 + }, + { + "epoch": 0.06835522419259311, + "grad_norm": 0.7088273005920183, + "learning_rate": 6.927642380589728e-07, + "loss": 1.5104, + "step": 981 + }, + { + "epoch": 0.06842490332021044, + "grad_norm": 0.7231089020272345, + "learning_rate": 6.927495356141747e-07, + "loss": 1.565, + "step": 982 + }, + { + "epoch": 0.06849458244782776, + "grad_norm": 0.7465251630697766, + "learning_rate": 6.927348184213869e-07, + "loss": 1.6297, + "step": 983 + }, + { + "epoch": 0.06856426157544508, + "grad_norm": 0.7028021616887362, + "learning_rate": 6.927200864813149e-07, + "loss": 1.5676, + "step": 984 + }, + { + "epoch": 0.0686339407030624, + "grad_norm": 0.7241553816634947, + "learning_rate": 6.927053397946644e-07, + "loss": 1.5426, + "step": 985 + }, + { + "epoch": 0.06870361983067971, + "grad_norm": 0.6631488358648268, + "learning_rate": 6.926905783621427e-07, + "loss": 1.426, + "step": 986 + }, + { + "epoch": 0.06877329895829704, + "grad_norm": 0.7238485756150155, + "learning_rate": 6.926758021844565e-07, + "loss": 1.6008, + "step": 987 + }, + { + "epoch": 0.06884297808591436, + "grad_norm": 0.6720576579169943, + "learning_rate": 6.926610112623144e-07, + "loss": 1.4347, + "step": 988 + }, + { + "epoch": 0.06891265721353168, + "grad_norm": 0.73334548757339, + "learning_rate": 6.926462055964249e-07, + "loss": 1.5721, + "step": 989 + }, + { + "epoch": 0.068982336341149, + "grad_norm": 0.7137428718304452, + "learning_rate": 6.926313851874977e-07, + "loss": 1.6358, + "step": 990 + }, + { + "epoch": 0.06905201546876633, + "grad_norm": 0.7729856075932263, + "learning_rate": 6.92616550036243e-07, + "loss": 1.5841, + "step": 991 + }, + { + "epoch": 0.06912169459638365, + "grad_norm": 0.720906050858157, + "learning_rate": 6.926017001433716e-07, + "loss": 1.5314, + "step": 992 + }, + { + "epoch": 0.06919137372400097, + "grad_norm": 0.7376452297137691, + "learning_rate": 6.925868355095953e-07, + "loss": 1.5631, + "step": 993 + }, + { + "epoch": 0.0692610528516183, + "grad_norm": 0.7351626865087098, + "learning_rate": 6.925719561356263e-07, + "loss": 1.4988, + "step": 994 + }, + { + "epoch": 0.06933073197923562, + "grad_norm": 0.6997542125412789, + "learning_rate": 6.925570620221779e-07, + "loss": 1.5988, + "step": 995 + }, + { + "epoch": 0.06940041110685294, + "grad_norm": 0.706269039215262, + "learning_rate": 6.925421531699636e-07, + "loss": 1.5851, + "step": 996 + }, + { + "epoch": 0.06947009023447026, + "grad_norm": 0.6832486756386302, + "learning_rate": 6.925272295796979e-07, + "loss": 1.4754, + "step": 997 + }, + { + "epoch": 0.06953976936208758, + "grad_norm": 0.7294064047869246, + "learning_rate": 6.92512291252096e-07, + "loss": 1.5658, + "step": 998 + }, + { + "epoch": 0.0696094484897049, + "grad_norm": 0.7209483681033394, + "learning_rate": 6.924973381878738e-07, + "loss": 1.4977, + "step": 999 + }, + { + "epoch": 0.06967912761732223, + "grad_norm": 0.6482722642545093, + "learning_rate": 6.92482370387748e-07, + "loss": 1.5456, + "step": 1000 + }, + { + "epoch": 0.06974880674493955, + "grad_norm": 0.6911923296637908, + "learning_rate": 6.924673878524356e-07, + "loss": 1.5774, + "step": 1001 + }, + { + "epoch": 0.06981848587255687, + "grad_norm": 0.6806494876420273, + "learning_rate": 6.924523905826549e-07, + "loss": 1.6226, + "step": 1002 + }, + { + "epoch": 0.0698881650001742, + "grad_norm": 0.7206441347495618, + "learning_rate": 6.924373785791244e-07, + "loss": 1.5996, + "step": 1003 + }, + { + "epoch": 0.06995784412779152, + "grad_norm": 0.6568982429262141, + "learning_rate": 6.924223518425635e-07, + "loss": 1.5396, + "step": 1004 + }, + { + "epoch": 0.07002752325540884, + "grad_norm": 0.718836906784285, + "learning_rate": 6.924073103736925e-07, + "loss": 1.4372, + "step": 1005 + }, + { + "epoch": 0.07009720238302616, + "grad_norm": 0.7071783051387326, + "learning_rate": 6.92392254173232e-07, + "loss": 1.5001, + "step": 1006 + }, + { + "epoch": 0.07016688151064349, + "grad_norm": 0.722945628224867, + "learning_rate": 6.923771832419036e-07, + "loss": 1.691, + "step": 1007 + }, + { + "epoch": 0.07023656063826081, + "grad_norm": 0.783086373789463, + "learning_rate": 6.923620975804296e-07, + "loss": 1.568, + "step": 1008 + }, + { + "epoch": 0.07030623976587813, + "grad_norm": 0.6976889026374747, + "learning_rate": 6.923469971895328e-07, + "loss": 1.6341, + "step": 1009 + }, + { + "epoch": 0.07037591889349545, + "grad_norm": 0.7523424135440696, + "learning_rate": 6.923318820699369e-07, + "loss": 1.5886, + "step": 1010 + }, + { + "epoch": 0.07044559802111278, + "grad_norm": 0.7173214160819936, + "learning_rate": 6.923167522223664e-07, + "loss": 1.6562, + "step": 1011 + }, + { + "epoch": 0.0705152771487301, + "grad_norm": 0.7123339152044518, + "learning_rate": 6.923016076475462e-07, + "loss": 1.5676, + "step": 1012 + }, + { + "epoch": 0.07058495627634742, + "grad_norm": 0.7333638244874635, + "learning_rate": 6.92286448346202e-07, + "loss": 1.6151, + "step": 1013 + }, + { + "epoch": 0.07065463540396474, + "grad_norm": 0.7004839329730347, + "learning_rate": 6.922712743190605e-07, + "loss": 1.5125, + "step": 1014 + }, + { + "epoch": 0.07072431453158207, + "grad_norm": 0.744033569156811, + "learning_rate": 6.922560855668486e-07, + "loss": 1.579, + "step": 1015 + }, + { + "epoch": 0.07079399365919939, + "grad_norm": 0.6842826563152172, + "learning_rate": 6.922408820902942e-07, + "loss": 1.4839, + "step": 1016 + }, + { + "epoch": 0.07086367278681671, + "grad_norm": 0.6780606922456736, + "learning_rate": 6.922256638901262e-07, + "loss": 1.478, + "step": 1017 + }, + { + "epoch": 0.07093335191443403, + "grad_norm": 0.704981494096453, + "learning_rate": 6.922104309670736e-07, + "loss": 1.6119, + "step": 1018 + }, + { + "epoch": 0.07100303104205136, + "grad_norm": 0.7637589997063822, + "learning_rate": 6.921951833218664e-07, + "loss": 1.6386, + "step": 1019 + }, + { + "epoch": 0.07107271016966868, + "grad_norm": 0.6944249613797556, + "learning_rate": 6.921799209552354e-07, + "loss": 1.428, + "step": 1020 + }, + { + "epoch": 0.071142389297286, + "grad_norm": 0.6580765696200459, + "learning_rate": 6.921646438679119e-07, + "loss": 1.6064, + "step": 1021 + }, + { + "epoch": 0.07121206842490332, + "grad_norm": 0.6919362421338939, + "learning_rate": 6.921493520606281e-07, + "loss": 1.5615, + "step": 1022 + }, + { + "epoch": 0.07128174755252065, + "grad_norm": 0.6524789983946453, + "learning_rate": 6.921340455341168e-07, + "loss": 1.4955, + "step": 1023 + }, + { + "epoch": 0.07135142668013797, + "grad_norm": 0.7358174972537268, + "learning_rate": 6.921187242891115e-07, + "loss": 1.5882, + "step": 1024 + }, + { + "epoch": 0.07142110580775529, + "grad_norm": 0.7351799396329001, + "learning_rate": 6.921033883263464e-07, + "loss": 1.5962, + "step": 1025 + }, + { + "epoch": 0.07149078493537261, + "grad_norm": 0.7773506925419971, + "learning_rate": 6.920880376465565e-07, + "loss": 1.5903, + "step": 1026 + }, + { + "epoch": 0.07156046406298994, + "grad_norm": 0.720052376496024, + "learning_rate": 6.920726722504773e-07, + "loss": 1.5835, + "step": 1027 + }, + { + "epoch": 0.07163014319060726, + "grad_norm": 0.7122596855425408, + "learning_rate": 6.920572921388453e-07, + "loss": 1.4811, + "step": 1028 + }, + { + "epoch": 0.07169982231822458, + "grad_norm": 0.7530822747200974, + "learning_rate": 6.920418973123976e-07, + "loss": 1.6935, + "step": 1029 + }, + { + "epoch": 0.0717695014458419, + "grad_norm": 0.6738093485439884, + "learning_rate": 6.920264877718716e-07, + "loss": 1.5866, + "step": 1030 + }, + { + "epoch": 0.07183918057345923, + "grad_norm": 0.7484468258192855, + "learning_rate": 6.920110635180063e-07, + "loss": 1.6671, + "step": 1031 + }, + { + "epoch": 0.07190885970107654, + "grad_norm": 0.7419311209570713, + "learning_rate": 6.919956245515402e-07, + "loss": 1.6695, + "step": 1032 + }, + { + "epoch": 0.07197853882869386, + "grad_norm": 0.6826743849739504, + "learning_rate": 6.919801708732137e-07, + "loss": 1.5021, + "step": 1033 + }, + { + "epoch": 0.07204821795631118, + "grad_norm": 0.7000484130543231, + "learning_rate": 6.919647024837673e-07, + "loss": 1.5443, + "step": 1034 + }, + { + "epoch": 0.0721178970839285, + "grad_norm": 0.6954818658655855, + "learning_rate": 6.91949219383942e-07, + "loss": 1.4374, + "step": 1035 + }, + { + "epoch": 0.07218757621154583, + "grad_norm": 0.715842381283248, + "learning_rate": 6.9193372157448e-07, + "loss": 1.5712, + "step": 1036 + }, + { + "epoch": 0.07225725533916315, + "grad_norm": 0.6767161402869131, + "learning_rate": 6.919182090561241e-07, + "loss": 1.3916, + "step": 1037 + }, + { + "epoch": 0.07232693446678047, + "grad_norm": 0.6910566641237087, + "learning_rate": 6.919026818296173e-07, + "loss": 1.5389, + "step": 1038 + }, + { + "epoch": 0.0723966135943978, + "grad_norm": 0.7341954371107908, + "learning_rate": 6.91887139895704e-07, + "loss": 1.5189, + "step": 1039 + }, + { + "epoch": 0.07246629272201512, + "grad_norm": 0.7214373950468113, + "learning_rate": 6.91871583255129e-07, + "loss": 1.5847, + "step": 1040 + }, + { + "epoch": 0.07253597184963244, + "grad_norm": 0.7014361915993079, + "learning_rate": 6.918560119086376e-07, + "loss": 1.4826, + "step": 1041 + }, + { + "epoch": 0.07260565097724976, + "grad_norm": 0.7704601014028651, + "learning_rate": 6.918404258569763e-07, + "loss": 1.468, + "step": 1042 + }, + { + "epoch": 0.07267533010486708, + "grad_norm": 0.808693366876018, + "learning_rate": 6.918248251008917e-07, + "loss": 1.6654, + "step": 1043 + }, + { + "epoch": 0.0727450092324844, + "grad_norm": 0.7110048932554214, + "learning_rate": 6.918092096411318e-07, + "loss": 1.5662, + "step": 1044 + }, + { + "epoch": 0.07281468836010173, + "grad_norm": 0.6982849324228914, + "learning_rate": 6.917935794784445e-07, + "loss": 1.5331, + "step": 1045 + }, + { + "epoch": 0.07288436748771905, + "grad_norm": 0.7381550381399294, + "learning_rate": 6.917779346135791e-07, + "loss": 1.5555, + "step": 1046 + }, + { + "epoch": 0.07295404661533637, + "grad_norm": 0.704548114293804, + "learning_rate": 6.917622750472852e-07, + "loss": 1.6146, + "step": 1047 + }, + { + "epoch": 0.0730237257429537, + "grad_norm": 0.7042543974542429, + "learning_rate": 6.917466007803135e-07, + "loss": 1.3595, + "step": 1048 + }, + { + "epoch": 0.07309340487057102, + "grad_norm": 0.7267693860346068, + "learning_rate": 6.917309118134148e-07, + "loss": 1.6352, + "step": 1049 + }, + { + "epoch": 0.07316308399818834, + "grad_norm": 0.7426775788118576, + "learning_rate": 6.917152081473412e-07, + "loss": 1.6405, + "step": 1050 + }, + { + "epoch": 0.07323276312580566, + "grad_norm": 0.7070458152085065, + "learning_rate": 6.91699489782845e-07, + "loss": 1.4703, + "step": 1051 + }, + { + "epoch": 0.07330244225342299, + "grad_norm": 0.7417121840086736, + "learning_rate": 6.916837567206797e-07, + "loss": 1.5257, + "step": 1052 + }, + { + "epoch": 0.07337212138104031, + "grad_norm": 0.7486626599689458, + "learning_rate": 6.916680089615992e-07, + "loss": 1.469, + "step": 1053 + }, + { + "epoch": 0.07344180050865763, + "grad_norm": 0.6854908185511666, + "learning_rate": 6.916522465063581e-07, + "loss": 1.487, + "step": 1054 + }, + { + "epoch": 0.07351147963627495, + "grad_norm": 0.7121319583710837, + "learning_rate": 6.916364693557117e-07, + "loss": 1.5364, + "step": 1055 + }, + { + "epoch": 0.07358115876389228, + "grad_norm": 0.7287291289514267, + "learning_rate": 6.916206775104164e-07, + "loss": 1.5476, + "step": 1056 + }, + { + "epoch": 0.0736508378915096, + "grad_norm": 0.744317342873817, + "learning_rate": 6.916048709712286e-07, + "loss": 1.5123, + "step": 1057 + }, + { + "epoch": 0.07372051701912692, + "grad_norm": 0.7352301107365474, + "learning_rate": 6.915890497389059e-07, + "loss": 1.5221, + "step": 1058 + }, + { + "epoch": 0.07379019614674424, + "grad_norm": 0.667449801712433, + "learning_rate": 6.915732138142066e-07, + "loss": 1.5428, + "step": 1059 + }, + { + "epoch": 0.07385987527436157, + "grad_norm": 0.8240861662656697, + "learning_rate": 6.915573631978896e-07, + "loss": 1.5278, + "step": 1060 + }, + { + "epoch": 0.07392955440197889, + "grad_norm": 0.8115926103137241, + "learning_rate": 6.915414978907143e-07, + "loss": 1.5511, + "step": 1061 + }, + { + "epoch": 0.07399923352959621, + "grad_norm": 0.7115918276627143, + "learning_rate": 6.915256178934411e-07, + "loss": 1.5539, + "step": 1062 + }, + { + "epoch": 0.07406891265721353, + "grad_norm": 0.7231534378390903, + "learning_rate": 6.915097232068309e-07, + "loss": 1.4308, + "step": 1063 + }, + { + "epoch": 0.07413859178483086, + "grad_norm": 0.6849196644707507, + "learning_rate": 6.914938138316456e-07, + "loss": 1.5336, + "step": 1064 + }, + { + "epoch": 0.07420827091244818, + "grad_norm": 0.712647033269892, + "learning_rate": 6.914778897686477e-07, + "loss": 1.4848, + "step": 1065 + }, + { + "epoch": 0.0742779500400655, + "grad_norm": 0.6841995015789953, + "learning_rate": 6.914619510185999e-07, + "loss": 1.5199, + "step": 1066 + }, + { + "epoch": 0.07434762916768282, + "grad_norm": 0.7032621096597862, + "learning_rate": 6.914459975822664e-07, + "loss": 1.5288, + "step": 1067 + }, + { + "epoch": 0.07441730829530015, + "grad_norm": 0.8064800483185524, + "learning_rate": 6.914300294604115e-07, + "loss": 1.5849, + "step": 1068 + }, + { + "epoch": 0.07448698742291747, + "grad_norm": 0.6616858768991769, + "learning_rate": 6.914140466538005e-07, + "loss": 1.5662, + "step": 1069 + }, + { + "epoch": 0.07455666655053479, + "grad_norm": 0.6981029713873405, + "learning_rate": 6.913980491631993e-07, + "loss": 1.5105, + "step": 1070 + }, + { + "epoch": 0.07462634567815211, + "grad_norm": 0.7960162371795243, + "learning_rate": 6.913820369893746e-07, + "loss": 1.511, + "step": 1071 + }, + { + "epoch": 0.07469602480576944, + "grad_norm": 0.696388376902543, + "learning_rate": 6.913660101330937e-07, + "loss": 1.4846, + "step": 1072 + }, + { + "epoch": 0.07476570393338676, + "grad_norm": 0.7920768720605448, + "learning_rate": 6.913499685951247e-07, + "loss": 1.5168, + "step": 1073 + }, + { + "epoch": 0.07483538306100408, + "grad_norm": 0.694588502839473, + "learning_rate": 6.913339123762361e-07, + "loss": 1.5247, + "step": 1074 + }, + { + "epoch": 0.0749050621886214, + "grad_norm": 0.7466383220743256, + "learning_rate": 6.913178414771977e-07, + "loss": 1.6722, + "step": 1075 + }, + { + "epoch": 0.07497474131623873, + "grad_norm": 0.7403064737880765, + "learning_rate": 6.913017558987794e-07, + "loss": 1.5448, + "step": 1076 + }, + { + "epoch": 0.07504442044385605, + "grad_norm": 0.7088682271914373, + "learning_rate": 6.912856556417521e-07, + "loss": 1.518, + "step": 1077 + }, + { + "epoch": 0.07511409957147336, + "grad_norm": 0.7251852788383186, + "learning_rate": 6.912695407068875e-07, + "loss": 1.4666, + "step": 1078 + }, + { + "epoch": 0.07518377869909068, + "grad_norm": 0.7118843231495031, + "learning_rate": 6.912534110949577e-07, + "loss": 1.5608, + "step": 1079 + }, + { + "epoch": 0.075253457826708, + "grad_norm": 0.7373827534866939, + "learning_rate": 6.912372668067356e-07, + "loss": 1.56, + "step": 1080 + }, + { + "epoch": 0.07532313695432533, + "grad_norm": 0.6652634128463273, + "learning_rate": 6.912211078429952e-07, + "loss": 1.4923, + "step": 1081 + }, + { + "epoch": 0.07539281608194265, + "grad_norm": 0.7210995227862197, + "learning_rate": 6.912049342045104e-07, + "loss": 1.6125, + "step": 1082 + }, + { + "epoch": 0.07546249520955997, + "grad_norm": 0.7873573632536712, + "learning_rate": 6.911887458920568e-07, + "loss": 1.6104, + "step": 1083 + }, + { + "epoch": 0.0755321743371773, + "grad_norm": 0.670639410874111, + "learning_rate": 6.911725429064096e-07, + "loss": 1.5491, + "step": 1084 + }, + { + "epoch": 0.07560185346479462, + "grad_norm": 0.7412530911154177, + "learning_rate": 6.911563252483458e-07, + "loss": 1.5383, + "step": 1085 + }, + { + "epoch": 0.07567153259241194, + "grad_norm": 0.8068313122574107, + "learning_rate": 6.911400929186425e-07, + "loss": 1.5513, + "step": 1086 + }, + { + "epoch": 0.07574121172002926, + "grad_norm": 0.721225693696014, + "learning_rate": 6.911238459180772e-07, + "loss": 1.5331, + "step": 1087 + }, + { + "epoch": 0.07581089084764658, + "grad_norm": 0.7167014213265765, + "learning_rate": 6.911075842474287e-07, + "loss": 1.6393, + "step": 1088 + }, + { + "epoch": 0.0758805699752639, + "grad_norm": 0.7171069278949371, + "learning_rate": 6.910913079074765e-07, + "loss": 1.549, + "step": 1089 + }, + { + "epoch": 0.07595024910288123, + "grad_norm": 0.6934862794176397, + "learning_rate": 6.910750168990005e-07, + "loss": 1.4912, + "step": 1090 + }, + { + "epoch": 0.07601992823049855, + "grad_norm": 0.7197521911287685, + "learning_rate": 6.910587112227811e-07, + "loss": 1.5344, + "step": 1091 + }, + { + "epoch": 0.07608960735811587, + "grad_norm": 0.7464831626664131, + "learning_rate": 6.910423908796001e-07, + "loss": 1.4993, + "step": 1092 + }, + { + "epoch": 0.0761592864857332, + "grad_norm": 0.7137983125771801, + "learning_rate": 6.910260558702393e-07, + "loss": 1.6077, + "step": 1093 + }, + { + "epoch": 0.07622896561335052, + "grad_norm": 0.7084434020338641, + "learning_rate": 6.910097061954817e-07, + "loss": 1.5826, + "step": 1094 + }, + { + "epoch": 0.07629864474096784, + "grad_norm": 0.6707576740728572, + "learning_rate": 6.909933418561109e-07, + "loss": 1.5207, + "step": 1095 + }, + { + "epoch": 0.07636832386858516, + "grad_norm": 0.701643205980676, + "learning_rate": 6.909769628529107e-07, + "loss": 1.5677, + "step": 1096 + }, + { + "epoch": 0.07643800299620249, + "grad_norm": 0.6960908138813413, + "learning_rate": 6.909605691866665e-07, + "loss": 1.5405, + "step": 1097 + }, + { + "epoch": 0.07650768212381981, + "grad_norm": 0.7409390432569176, + "learning_rate": 6.909441608581636e-07, + "loss": 1.6348, + "step": 1098 + }, + { + "epoch": 0.07657736125143713, + "grad_norm": 0.7330616230821628, + "learning_rate": 6.909277378681885e-07, + "loss": 1.6215, + "step": 1099 + }, + { + "epoch": 0.07664704037905445, + "grad_norm": 0.706598439534836, + "learning_rate": 6.909113002175281e-07, + "loss": 1.5915, + "step": 1100 + }, + { + "epoch": 0.07671671950667178, + "grad_norm": 0.7230263429802867, + "learning_rate": 6.908948479069701e-07, + "loss": 1.5836, + "step": 1101 + }, + { + "epoch": 0.0767863986342891, + "grad_norm": 0.6871898211556535, + "learning_rate": 6.908783809373031e-07, + "loss": 1.5958, + "step": 1102 + }, + { + "epoch": 0.07685607776190642, + "grad_norm": 0.7139068952007671, + "learning_rate": 6.908618993093161e-07, + "loss": 1.6262, + "step": 1103 + }, + { + "epoch": 0.07692575688952374, + "grad_norm": 0.6569664827836058, + "learning_rate": 6.90845403023799e-07, + "loss": 1.5115, + "step": 1104 + }, + { + "epoch": 0.07699543601714107, + "grad_norm": 0.7253554862037589, + "learning_rate": 6.908288920815422e-07, + "loss": 1.529, + "step": 1105 + }, + { + "epoch": 0.07706511514475839, + "grad_norm": 0.6970591877647012, + "learning_rate": 6.90812366483337e-07, + "loss": 1.6525, + "step": 1106 + }, + { + "epoch": 0.07713479427237571, + "grad_norm": 0.6659396879644911, + "learning_rate": 6.907958262299755e-07, + "loss": 1.5162, + "step": 1107 + }, + { + "epoch": 0.07720447339999303, + "grad_norm": 0.6871713539255694, + "learning_rate": 6.907792713222501e-07, + "loss": 1.6047, + "step": 1108 + }, + { + "epoch": 0.07727415252761036, + "grad_norm": 0.7116167931927397, + "learning_rate": 6.907627017609543e-07, + "loss": 1.485, + "step": 1109 + }, + { + "epoch": 0.07734383165522768, + "grad_norm": 0.7053043605198956, + "learning_rate": 6.907461175468822e-07, + "loss": 1.4887, + "step": 1110 + }, + { + "epoch": 0.077413510782845, + "grad_norm": 0.6875986075874049, + "learning_rate": 6.907295186808285e-07, + "loss": 1.5854, + "step": 1111 + }, + { + "epoch": 0.07748318991046232, + "grad_norm": 0.7028170272289873, + "learning_rate": 6.907129051635885e-07, + "loss": 1.6719, + "step": 1112 + }, + { + "epoch": 0.07755286903807965, + "grad_norm": 0.7086162175094449, + "learning_rate": 6.906962769959585e-07, + "loss": 1.5725, + "step": 1113 + }, + { + "epoch": 0.07762254816569697, + "grad_norm": 0.7438672634911797, + "learning_rate": 6.906796341787353e-07, + "loss": 1.5896, + "step": 1114 + }, + { + "epoch": 0.07769222729331429, + "grad_norm": 0.6975872227510016, + "learning_rate": 6.906629767127165e-07, + "loss": 1.5404, + "step": 1115 + }, + { + "epoch": 0.07776190642093161, + "grad_norm": 0.7077339896057622, + "learning_rate": 6.906463045987004e-07, + "loss": 1.6565, + "step": 1116 + }, + { + "epoch": 0.07783158554854894, + "grad_norm": 0.7147410083240348, + "learning_rate": 6.906296178374858e-07, + "loss": 1.4631, + "step": 1117 + }, + { + "epoch": 0.07790126467616626, + "grad_norm": 0.7178738712503007, + "learning_rate": 6.906129164298726e-07, + "loss": 1.6519, + "step": 1118 + }, + { + "epoch": 0.07797094380378358, + "grad_norm": 0.7567807790555247, + "learning_rate": 6.905962003766609e-07, + "loss": 1.606, + "step": 1119 + }, + { + "epoch": 0.0780406229314009, + "grad_norm": 0.7677258840305423, + "learning_rate": 6.90579469678652e-07, + "loss": 1.514, + "step": 1120 + }, + { + "epoch": 0.07811030205901823, + "grad_norm": 0.7107671421573318, + "learning_rate": 6.905627243366474e-07, + "loss": 1.7219, + "step": 1121 + }, + { + "epoch": 0.07817998118663555, + "grad_norm": 0.7435836249390709, + "learning_rate": 6.905459643514499e-07, + "loss": 1.5213, + "step": 1122 + }, + { + "epoch": 0.07824966031425287, + "grad_norm": 0.7182999889998696, + "learning_rate": 6.905291897238625e-07, + "loss": 1.5407, + "step": 1123 + }, + { + "epoch": 0.07831933944187018, + "grad_norm": 0.7416359447410312, + "learning_rate": 6.905124004546891e-07, + "loss": 1.5425, + "step": 1124 + }, + { + "epoch": 0.0783890185694875, + "grad_norm": 0.757556656682122, + "learning_rate": 6.904955965447342e-07, + "loss": 1.5452, + "step": 1125 + }, + { + "epoch": 0.07845869769710483, + "grad_norm": 0.6606122586088298, + "learning_rate": 6.904787779948031e-07, + "loss": 1.4316, + "step": 1126 + }, + { + "epoch": 0.07852837682472215, + "grad_norm": 0.7167556172667429, + "learning_rate": 6.90461944805702e-07, + "loss": 1.5075, + "step": 1127 + }, + { + "epoch": 0.07859805595233947, + "grad_norm": 0.641614834382918, + "learning_rate": 6.904450969782374e-07, + "loss": 1.4476, + "step": 1128 + }, + { + "epoch": 0.0786677350799568, + "grad_norm": 0.7465073966135141, + "learning_rate": 6.904282345132164e-07, + "loss": 1.6159, + "step": 1129 + }, + { + "epoch": 0.07873741420757412, + "grad_norm": 0.7399215393549937, + "learning_rate": 6.904113574114476e-07, + "loss": 1.64, + "step": 1130 + }, + { + "epoch": 0.07880709333519144, + "grad_norm": 0.6855077939067492, + "learning_rate": 6.903944656737396e-07, + "loss": 1.4624, + "step": 1131 + }, + { + "epoch": 0.07887677246280876, + "grad_norm": 0.7251989381893763, + "learning_rate": 6.903775593009017e-07, + "loss": 1.5619, + "step": 1132 + }, + { + "epoch": 0.07894645159042608, + "grad_norm": 0.6915517066303697, + "learning_rate": 6.903606382937443e-07, + "loss": 1.4697, + "step": 1133 + }, + { + "epoch": 0.0790161307180434, + "grad_norm": 0.7016250746878615, + "learning_rate": 6.903437026530782e-07, + "loss": 1.4792, + "step": 1134 + }, + { + "epoch": 0.07908580984566073, + "grad_norm": 0.7219157858272937, + "learning_rate": 6.90326752379715e-07, + "loss": 1.6381, + "step": 1135 + }, + { + "epoch": 0.07915548897327805, + "grad_norm": 0.7397081395480988, + "learning_rate": 6.90309787474467e-07, + "loss": 1.5577, + "step": 1136 + }, + { + "epoch": 0.07922516810089537, + "grad_norm": 0.7266051003891588, + "learning_rate": 6.902928079381473e-07, + "loss": 1.6519, + "step": 1137 + }, + { + "epoch": 0.0792948472285127, + "grad_norm": 0.7290609479762402, + "learning_rate": 6.902758137715693e-07, + "loss": 1.5796, + "step": 1138 + }, + { + "epoch": 0.07936452635613002, + "grad_norm": 0.6567695028044657, + "learning_rate": 6.902588049755478e-07, + "loss": 1.501, + "step": 1139 + }, + { + "epoch": 0.07943420548374734, + "grad_norm": 0.7077836191905298, + "learning_rate": 6.902417815508975e-07, + "loss": 1.5554, + "step": 1140 + }, + { + "epoch": 0.07950388461136466, + "grad_norm": 0.640089811517191, + "learning_rate": 6.902247434984345e-07, + "loss": 1.4511, + "step": 1141 + }, + { + "epoch": 0.07957356373898199, + "grad_norm": 0.7397155755165485, + "learning_rate": 6.902076908189751e-07, + "loss": 1.6186, + "step": 1142 + }, + { + "epoch": 0.07964324286659931, + "grad_norm": 0.7370105780872295, + "learning_rate": 6.901906235133365e-07, + "loss": 1.5568, + "step": 1143 + }, + { + "epoch": 0.07971292199421663, + "grad_norm": 0.678857242792769, + "learning_rate": 6.90173541582337e-07, + "loss": 1.5696, + "step": 1144 + }, + { + "epoch": 0.07978260112183395, + "grad_norm": 0.7033520068281577, + "learning_rate": 6.901564450267946e-07, + "loss": 1.6165, + "step": 1145 + }, + { + "epoch": 0.07985228024945128, + "grad_norm": 0.7137263676325256, + "learning_rate": 6.901393338475288e-07, + "loss": 1.6099, + "step": 1146 + }, + { + "epoch": 0.0799219593770686, + "grad_norm": 0.7144404342321975, + "learning_rate": 6.901222080453598e-07, + "loss": 1.5078, + "step": 1147 + }, + { + "epoch": 0.07999163850468592, + "grad_norm": 0.718330695095129, + "learning_rate": 6.901050676211082e-07, + "loss": 1.5648, + "step": 1148 + }, + { + "epoch": 0.08006131763230324, + "grad_norm": 0.7194005716639188, + "learning_rate": 6.900879125755955e-07, + "loss": 1.511, + "step": 1149 + }, + { + "epoch": 0.08013099675992057, + "grad_norm": 0.7678458971817134, + "learning_rate": 6.900707429096435e-07, + "loss": 1.5421, + "step": 1150 + }, + { + "epoch": 0.08020067588753789, + "grad_norm": 0.6985402111375227, + "learning_rate": 6.900535586240753e-07, + "loss": 1.539, + "step": 1151 + }, + { + "epoch": 0.08027035501515521, + "grad_norm": 0.6740543021552393, + "learning_rate": 6.900363597197144e-07, + "loss": 1.544, + "step": 1152 + }, + { + "epoch": 0.08034003414277253, + "grad_norm": 0.7044355580971693, + "learning_rate": 6.900191461973848e-07, + "loss": 1.5279, + "step": 1153 + }, + { + "epoch": 0.08040971327038986, + "grad_norm": 0.7132437628563113, + "learning_rate": 6.900019180579115e-07, + "loss": 1.5389, + "step": 1154 + }, + { + "epoch": 0.08047939239800718, + "grad_norm": 0.6845257281834255, + "learning_rate": 6.899846753021202e-07, + "loss": 1.576, + "step": 1155 + }, + { + "epoch": 0.0805490715256245, + "grad_norm": 0.7459578464387648, + "learning_rate": 6.899674179308371e-07, + "loss": 1.4598, + "step": 1156 + }, + { + "epoch": 0.08061875065324182, + "grad_norm": 0.7157920946244044, + "learning_rate": 6.899501459448892e-07, + "loss": 1.534, + "step": 1157 + }, + { + "epoch": 0.08068842978085915, + "grad_norm": 0.8475637704029476, + "learning_rate": 6.899328593451042e-07, + "loss": 1.5881, + "step": 1158 + }, + { + "epoch": 0.08075810890847647, + "grad_norm": 0.7428948647025614, + "learning_rate": 6.899155581323108e-07, + "loss": 1.5358, + "step": 1159 + }, + { + "epoch": 0.08082778803609379, + "grad_norm": 0.6865769571057071, + "learning_rate": 6.898982423073378e-07, + "loss": 1.5509, + "step": 1160 + }, + { + "epoch": 0.08089746716371111, + "grad_norm": 0.7284383625362557, + "learning_rate": 6.898809118710149e-07, + "loss": 1.6219, + "step": 1161 + }, + { + "epoch": 0.08096714629132844, + "grad_norm": 0.7222780249612296, + "learning_rate": 6.89863566824173e-07, + "loss": 1.6027, + "step": 1162 + }, + { + "epoch": 0.08103682541894576, + "grad_norm": 0.7075963278835353, + "learning_rate": 6.89846207167643e-07, + "loss": 1.5669, + "step": 1163 + }, + { + "epoch": 0.08110650454656308, + "grad_norm": 0.7514979660782378, + "learning_rate": 6.89828832902257e-07, + "loss": 1.6114, + "step": 1164 + }, + { + "epoch": 0.0811761836741804, + "grad_norm": 0.6928840372383243, + "learning_rate": 6.898114440288475e-07, + "loss": 1.5779, + "step": 1165 + }, + { + "epoch": 0.08124586280179773, + "grad_norm": 0.7625255343114417, + "learning_rate": 6.897940405482478e-07, + "loss": 1.5837, + "step": 1166 + }, + { + "epoch": 0.08131554192941505, + "grad_norm": 0.6853653653010325, + "learning_rate": 6.897766224612919e-07, + "loss": 1.5777, + "step": 1167 + }, + { + "epoch": 0.08138522105703237, + "grad_norm": 0.7381018328852454, + "learning_rate": 6.897591897688147e-07, + "loss": 1.5864, + "step": 1168 + }, + { + "epoch": 0.0814549001846497, + "grad_norm": 0.6959768347113379, + "learning_rate": 6.897417424716513e-07, + "loss": 1.6115, + "step": 1169 + }, + { + "epoch": 0.081524579312267, + "grad_norm": 0.6895660442037262, + "learning_rate": 6.897242805706381e-07, + "loss": 1.5186, + "step": 1170 + }, + { + "epoch": 0.08159425843988433, + "grad_norm": 0.7519857537232603, + "learning_rate": 6.897068040666117e-07, + "loss": 1.5396, + "step": 1171 + }, + { + "epoch": 0.08166393756750165, + "grad_norm": 0.718532609238401, + "learning_rate": 6.896893129604098e-07, + "loss": 1.5395, + "step": 1172 + }, + { + "epoch": 0.08173361669511897, + "grad_norm": 0.7399091934568263, + "learning_rate": 6.896718072528704e-07, + "loss": 1.621, + "step": 1173 + }, + { + "epoch": 0.0818032958227363, + "grad_norm": 0.6925568720224577, + "learning_rate": 6.896542869448327e-07, + "loss": 1.5752, + "step": 1174 + }, + { + "epoch": 0.08187297495035362, + "grad_norm": 0.7067599488817029, + "learning_rate": 6.896367520371359e-07, + "loss": 1.5278, + "step": 1175 + }, + { + "epoch": 0.08194265407797094, + "grad_norm": 0.7103145143633663, + "learning_rate": 6.896192025306206e-07, + "loss": 1.6391, + "step": 1176 + }, + { + "epoch": 0.08201233320558826, + "grad_norm": 0.7734153160510779, + "learning_rate": 6.896016384261277e-07, + "loss": 1.5156, + "step": 1177 + }, + { + "epoch": 0.08208201233320558, + "grad_norm": 0.6859669641724797, + "learning_rate": 6.895840597244991e-07, + "loss": 1.5193, + "step": 1178 + }, + { + "epoch": 0.0821516914608229, + "grad_norm": 0.673186710234392, + "learning_rate": 6.89566466426577e-07, + "loss": 1.3677, + "step": 1179 + }, + { + "epoch": 0.08222137058844023, + "grad_norm": 0.7640001453836751, + "learning_rate": 6.895488585332045e-07, + "loss": 1.6201, + "step": 1180 + }, + { + "epoch": 0.08229104971605755, + "grad_norm": 0.7061881267193482, + "learning_rate": 6.895312360452255e-07, + "loss": 1.5405, + "step": 1181 + }, + { + "epoch": 0.08236072884367487, + "grad_norm": 0.7576495015869381, + "learning_rate": 6.895135989634845e-07, + "loss": 1.6068, + "step": 1182 + }, + { + "epoch": 0.0824304079712922, + "grad_norm": 0.8049680580056225, + "learning_rate": 6.894959472888267e-07, + "loss": 1.6249, + "step": 1183 + }, + { + "epoch": 0.08250008709890952, + "grad_norm": 0.7441843872012226, + "learning_rate": 6.894782810220979e-07, + "loss": 1.6032, + "step": 1184 + }, + { + "epoch": 0.08256976622652684, + "grad_norm": 0.6633384020829896, + "learning_rate": 6.89460600164145e-07, + "loss": 1.5106, + "step": 1185 + }, + { + "epoch": 0.08263944535414416, + "grad_norm": 0.8483096471103987, + "learning_rate": 6.89442904715815e-07, + "loss": 1.5806, + "step": 1186 + }, + { + "epoch": 0.08270912448176149, + "grad_norm": 0.6666047476495981, + "learning_rate": 6.894251946779559e-07, + "loss": 1.442, + "step": 1187 + }, + { + "epoch": 0.08277880360937881, + "grad_norm": 0.7289293352898126, + "learning_rate": 6.894074700514165e-07, + "loss": 1.5653, + "step": 1188 + }, + { + "epoch": 0.08284848273699613, + "grad_norm": 0.7406788086107634, + "learning_rate": 6.893897308370463e-07, + "loss": 1.3979, + "step": 1189 + }, + { + "epoch": 0.08291816186461345, + "grad_norm": 0.6537157668650204, + "learning_rate": 6.893719770356951e-07, + "loss": 1.431, + "step": 1190 + }, + { + "epoch": 0.08298784099223078, + "grad_norm": 0.7073784965936744, + "learning_rate": 6.893542086482141e-07, + "loss": 1.3773, + "step": 1191 + }, + { + "epoch": 0.0830575201198481, + "grad_norm": 26.82777241523961, + "learning_rate": 6.893364256754546e-07, + "loss": 1.5639, + "step": 1192 + }, + { + "epoch": 0.08312719924746542, + "grad_norm": 0.6623471303915018, + "learning_rate": 6.893186281182687e-07, + "loss": 1.5554, + "step": 1193 + }, + { + "epoch": 0.08319687837508274, + "grad_norm": 0.6809260214860811, + "learning_rate": 6.893008159775095e-07, + "loss": 1.4711, + "step": 1194 + }, + { + "epoch": 0.08326655750270007, + "grad_norm": 0.7157109529885805, + "learning_rate": 6.892829892540304e-07, + "loss": 1.4567, + "step": 1195 + }, + { + "epoch": 0.08333623663031739, + "grad_norm": 0.7103122232716318, + "learning_rate": 6.892651479486856e-07, + "loss": 1.5585, + "step": 1196 + }, + { + "epoch": 0.08340591575793471, + "grad_norm": 0.7422514010124394, + "learning_rate": 6.892472920623305e-07, + "loss": 1.5075, + "step": 1197 + }, + { + "epoch": 0.08347559488555203, + "grad_norm": 0.6853896896273873, + "learning_rate": 6.892294215958206e-07, + "loss": 1.4262, + "step": 1198 + }, + { + "epoch": 0.08354527401316936, + "grad_norm": 0.706936413282032, + "learning_rate": 6.892115365500121e-07, + "loss": 1.5037, + "step": 1199 + }, + { + "epoch": 0.08361495314078668, + "grad_norm": 0.6749722628125229, + "learning_rate": 6.891936369257622e-07, + "loss": 1.668, + "step": 1200 + }, + { + "epoch": 0.083684632268404, + "grad_norm": 0.7119207196263206, + "learning_rate": 6.891757227239289e-07, + "loss": 1.6217, + "step": 1201 + }, + { + "epoch": 0.08375431139602132, + "grad_norm": 0.7160388119086022, + "learning_rate": 6.891577939453703e-07, + "loss": 1.4864, + "step": 1202 + }, + { + "epoch": 0.08382399052363865, + "grad_norm": 0.719587381416626, + "learning_rate": 6.891398505909459e-07, + "loss": 1.6268, + "step": 1203 + }, + { + "epoch": 0.08389366965125597, + "grad_norm": 0.7042976963968398, + "learning_rate": 6.891218926615155e-07, + "loss": 1.5495, + "step": 1204 + }, + { + "epoch": 0.08396334877887329, + "grad_norm": 0.704887862807588, + "learning_rate": 6.891039201579396e-07, + "loss": 1.5302, + "step": 1205 + }, + { + "epoch": 0.08403302790649061, + "grad_norm": 0.6661954067466997, + "learning_rate": 6.890859330810796e-07, + "loss": 1.6247, + "step": 1206 + }, + { + "epoch": 0.08410270703410794, + "grad_norm": 0.6652903746555803, + "learning_rate": 6.890679314317973e-07, + "loss": 1.3775, + "step": 1207 + }, + { + "epoch": 0.08417238616172526, + "grad_norm": 0.6829149653109714, + "learning_rate": 6.890499152109555e-07, + "loss": 1.4859, + "step": 1208 + }, + { + "epoch": 0.08424206528934258, + "grad_norm": 0.7343751200022568, + "learning_rate": 6.890318844194177e-07, + "loss": 1.482, + "step": 1209 + }, + { + "epoch": 0.0843117444169599, + "grad_norm": 0.68418704479864, + "learning_rate": 6.890138390580477e-07, + "loss": 1.4719, + "step": 1210 + }, + { + "epoch": 0.08438142354457723, + "grad_norm": 0.7197352748202582, + "learning_rate": 6.889957791277104e-07, + "loss": 1.6337, + "step": 1211 + }, + { + "epoch": 0.08445110267219455, + "grad_norm": 0.6649129082292412, + "learning_rate": 6.889777046292714e-07, + "loss": 1.6359, + "step": 1212 + }, + { + "epoch": 0.08452078179981187, + "grad_norm": 0.7587559411692023, + "learning_rate": 6.889596155635966e-07, + "loss": 1.5549, + "step": 1213 + }, + { + "epoch": 0.0845904609274292, + "grad_norm": 0.6806377337801913, + "learning_rate": 6.889415119315531e-07, + "loss": 1.496, + "step": 1214 + }, + { + "epoch": 0.08466014005504652, + "grad_norm": 0.776941258193978, + "learning_rate": 6.889233937340084e-07, + "loss": 1.5431, + "step": 1215 + }, + { + "epoch": 0.08472981918266383, + "grad_norm": 0.6941217236555354, + "learning_rate": 6.889052609718306e-07, + "loss": 1.5759, + "step": 1216 + }, + { + "epoch": 0.08479949831028115, + "grad_norm": 0.7495506413349972, + "learning_rate": 6.888871136458888e-07, + "loss": 1.476, + "step": 1217 + }, + { + "epoch": 0.08486917743789847, + "grad_norm": 0.7426227429066157, + "learning_rate": 6.888689517570526e-07, + "loss": 1.4853, + "step": 1218 + }, + { + "epoch": 0.0849388565655158, + "grad_norm": 0.7340607548502726, + "learning_rate": 6.888507753061925e-07, + "loss": 1.5197, + "step": 1219 + }, + { + "epoch": 0.08500853569313312, + "grad_norm": 0.7516432731396169, + "learning_rate": 6.888325842941795e-07, + "loss": 1.5661, + "step": 1220 + }, + { + "epoch": 0.08507821482075044, + "grad_norm": 0.690211007495237, + "learning_rate": 6.888143787218852e-07, + "loss": 1.4689, + "step": 1221 + }, + { + "epoch": 0.08514789394836776, + "grad_norm": 0.7315103283612352, + "learning_rate": 6.887961585901822e-07, + "loss": 1.5662, + "step": 1222 + }, + { + "epoch": 0.08521757307598508, + "grad_norm": 0.7556938120563503, + "learning_rate": 6.887779238999436e-07, + "loss": 1.4909, + "step": 1223 + }, + { + "epoch": 0.0852872522036024, + "grad_norm": 0.7979406139485317, + "learning_rate": 6.887596746520433e-07, + "loss": 1.6, + "step": 1224 + }, + { + "epoch": 0.08535693133121973, + "grad_norm": 0.7772957970517779, + "learning_rate": 6.887414108473558e-07, + "loss": 1.6837, + "step": 1225 + }, + { + "epoch": 0.08542661045883705, + "grad_norm": 0.7963373152044353, + "learning_rate": 6.887231324867562e-07, + "loss": 1.6797, + "step": 1226 + }, + { + "epoch": 0.08549628958645437, + "grad_norm": 0.6920973186236006, + "learning_rate": 6.887048395711207e-07, + "loss": 1.4087, + "step": 1227 + }, + { + "epoch": 0.0855659687140717, + "grad_norm": 0.6683888822025812, + "learning_rate": 6.886865321013255e-07, + "loss": 1.4964, + "step": 1228 + }, + { + "epoch": 0.08563564784168902, + "grad_norm": 0.7450329584753508, + "learning_rate": 6.886682100782485e-07, + "loss": 1.5268, + "step": 1229 + }, + { + "epoch": 0.08570532696930634, + "grad_norm": 0.6858244481120948, + "learning_rate": 6.886498735027673e-07, + "loss": 1.5805, + "step": 1230 + }, + { + "epoch": 0.08577500609692366, + "grad_norm": 0.7184719502519555, + "learning_rate": 6.88631522375761e-07, + "loss": 1.528, + "step": 1231 + }, + { + "epoch": 0.08584468522454099, + "grad_norm": 0.7760134696891086, + "learning_rate": 6.886131566981086e-07, + "loss": 1.5172, + "step": 1232 + }, + { + "epoch": 0.08591436435215831, + "grad_norm": 0.7105109119867613, + "learning_rate": 6.885947764706906e-07, + "loss": 1.532, + "step": 1233 + }, + { + "epoch": 0.08598404347977563, + "grad_norm": 0.7015865098791763, + "learning_rate": 6.885763816943875e-07, + "loss": 1.4887, + "step": 1234 + }, + { + "epoch": 0.08605372260739295, + "grad_norm": 0.6983077687068207, + "learning_rate": 6.885579723700809e-07, + "loss": 1.5157, + "step": 1235 + }, + { + "epoch": 0.08612340173501028, + "grad_norm": 0.7334632983600148, + "learning_rate": 6.885395484986532e-07, + "loss": 1.5426, + "step": 1236 + }, + { + "epoch": 0.0861930808626276, + "grad_norm": 0.693820895798714, + "learning_rate": 6.885211100809872e-07, + "loss": 1.4726, + "step": 1237 + }, + { + "epoch": 0.08626275999024492, + "grad_norm": 0.6706017501517098, + "learning_rate": 6.885026571179664e-07, + "loss": 1.553, + "step": 1238 + }, + { + "epoch": 0.08633243911786224, + "grad_norm": 0.6712469046406953, + "learning_rate": 6.884841896104753e-07, + "loss": 1.5283, + "step": 1239 + }, + { + "epoch": 0.08640211824547957, + "grad_norm": 0.7396947117151503, + "learning_rate": 6.884657075593987e-07, + "loss": 1.4582, + "step": 1240 + }, + { + "epoch": 0.08647179737309689, + "grad_norm": 0.7470920745785362, + "learning_rate": 6.884472109656224e-07, + "loss": 1.5739, + "step": 1241 + }, + { + "epoch": 0.08654147650071421, + "grad_norm": 5.5737115931864585, + "learning_rate": 6.884286998300328e-07, + "loss": 1.5824, + "step": 1242 + }, + { + "epoch": 0.08661115562833153, + "grad_norm": 0.6947509528605078, + "learning_rate": 6.88410174153517e-07, + "loss": 1.5973, + "step": 1243 + }, + { + "epoch": 0.08668083475594886, + "grad_norm": 0.7216219247735955, + "learning_rate": 6.883916339369627e-07, + "loss": 1.5319, + "step": 1244 + }, + { + "epoch": 0.08675051388356618, + "grad_norm": 0.7350172934004601, + "learning_rate": 6.883730791812585e-07, + "loss": 1.5269, + "step": 1245 + }, + { + "epoch": 0.0868201930111835, + "grad_norm": 0.6955094801968497, + "learning_rate": 6.883545098872935e-07, + "loss": 1.6165, + "step": 1246 + }, + { + "epoch": 0.08688987213880082, + "grad_norm": 0.7538227602808574, + "learning_rate": 6.883359260559576e-07, + "loss": 1.4654, + "step": 1247 + }, + { + "epoch": 0.08695955126641815, + "grad_norm": 0.789486381100046, + "learning_rate": 6.883173276881414e-07, + "loss": 1.5613, + "step": 1248 + }, + { + "epoch": 0.08702923039403547, + "grad_norm": 0.6975860457622158, + "learning_rate": 6.882987147847363e-07, + "loss": 1.5203, + "step": 1249 + }, + { + "epoch": 0.08709890952165279, + "grad_norm": 0.7033421302463916, + "learning_rate": 6.88280087346634e-07, + "loss": 1.5222, + "step": 1250 + }, + { + "epoch": 0.08716858864927012, + "grad_norm": 0.7153412252377012, + "learning_rate": 6.882614453747272e-07, + "loss": 1.4739, + "step": 1251 + }, + { + "epoch": 0.08723826777688744, + "grad_norm": 0.694186348996605, + "learning_rate": 6.882427888699094e-07, + "loss": 1.5958, + "step": 1252 + }, + { + "epoch": 0.08730794690450476, + "grad_norm": 0.6852337145570081, + "learning_rate": 6.882241178330748e-07, + "loss": 1.545, + "step": 1253 + }, + { + "epoch": 0.08737762603212208, + "grad_norm": 0.7840072464679235, + "learning_rate": 6.882054322651178e-07, + "loss": 1.5576, + "step": 1254 + }, + { + "epoch": 0.0874473051597394, + "grad_norm": 0.7009252831810354, + "learning_rate": 6.88186732166934e-07, + "loss": 1.5604, + "step": 1255 + }, + { + "epoch": 0.08751698428735673, + "grad_norm": 0.7162091483897209, + "learning_rate": 6.881680175394195e-07, + "loss": 1.4449, + "step": 1256 + }, + { + "epoch": 0.08758666341497405, + "grad_norm": 0.6789452720548425, + "learning_rate": 6.881492883834714e-07, + "loss": 1.5185, + "step": 1257 + }, + { + "epoch": 0.08765634254259137, + "grad_norm": 0.75702264090507, + "learning_rate": 6.881305446999869e-07, + "loss": 1.5161, + "step": 1258 + }, + { + "epoch": 0.0877260216702087, + "grad_norm": 0.6837204332937512, + "learning_rate": 6.881117864898646e-07, + "loss": 1.5034, + "step": 1259 + }, + { + "epoch": 0.08779570079782602, + "grad_norm": 0.70633577212723, + "learning_rate": 6.88093013754003e-07, + "loss": 1.4734, + "step": 1260 + }, + { + "epoch": 0.08786537992544333, + "grad_norm": 0.7236570341818808, + "learning_rate": 6.880742264933021e-07, + "loss": 1.5184, + "step": 1261 + }, + { + "epoch": 0.08793505905306065, + "grad_norm": 0.7415268607505546, + "learning_rate": 6.880554247086619e-07, + "loss": 1.6711, + "step": 1262 + }, + { + "epoch": 0.08800473818067797, + "grad_norm": 0.6770620482470922, + "learning_rate": 6.880366084009836e-07, + "loss": 1.5219, + "step": 1263 + }, + { + "epoch": 0.0880744173082953, + "grad_norm": 0.7006824285806458, + "learning_rate": 6.880177775711691e-07, + "loss": 1.4605, + "step": 1264 + }, + { + "epoch": 0.08814409643591262, + "grad_norm": 0.6941678012435419, + "learning_rate": 6.879989322201204e-07, + "loss": 1.5493, + "step": 1265 + }, + { + "epoch": 0.08821377556352994, + "grad_norm": 0.6932515587732107, + "learning_rate": 6.879800723487409e-07, + "loss": 1.596, + "step": 1266 + }, + { + "epoch": 0.08828345469114726, + "grad_norm": 0.727639027077372, + "learning_rate": 6.879611979579343e-07, + "loss": 1.6517, + "step": 1267 + }, + { + "epoch": 0.08835313381876458, + "grad_norm": 0.7187124776898827, + "learning_rate": 6.879423090486051e-07, + "loss": 1.4588, + "step": 1268 + }, + { + "epoch": 0.0884228129463819, + "grad_norm": 0.714150308857557, + "learning_rate": 6.879234056216587e-07, + "loss": 1.5832, + "step": 1269 + }, + { + "epoch": 0.08849249207399923, + "grad_norm": 0.7638975002556709, + "learning_rate": 6.879044876780006e-07, + "loss": 1.6047, + "step": 1270 + }, + { + "epoch": 0.08856217120161655, + "grad_norm": 0.7434466148742814, + "learning_rate": 6.878855552185377e-07, + "loss": 1.6107, + "step": 1271 + }, + { + "epoch": 0.08863185032923387, + "grad_norm": 0.7399244720941883, + "learning_rate": 6.878666082441772e-07, + "loss": 1.4261, + "step": 1272 + }, + { + "epoch": 0.0887015294568512, + "grad_norm": 0.6842006315513699, + "learning_rate": 6.87847646755827e-07, + "loss": 1.4954, + "step": 1273 + }, + { + "epoch": 0.08877120858446852, + "grad_norm": 0.7245096057643684, + "learning_rate": 6.878286707543958e-07, + "loss": 1.5579, + "step": 1274 + }, + { + "epoch": 0.08884088771208584, + "grad_norm": 0.7879892971492194, + "learning_rate": 6.878096802407931e-07, + "loss": 1.5088, + "step": 1275 + }, + { + "epoch": 0.08891056683970316, + "grad_norm": 0.6868473383580004, + "learning_rate": 6.877906752159289e-07, + "loss": 1.3988, + "step": 1276 + }, + { + "epoch": 0.08898024596732049, + "grad_norm": 0.7437043498439738, + "learning_rate": 6.877716556807138e-07, + "loss": 1.6203, + "step": 1277 + }, + { + "epoch": 0.08904992509493781, + "grad_norm": 0.7058403378765666, + "learning_rate": 6.877526216360595e-07, + "loss": 1.5629, + "step": 1278 + }, + { + "epoch": 0.08911960422255513, + "grad_norm": 0.7342911778185349, + "learning_rate": 6.87733573082878e-07, + "loss": 1.617, + "step": 1279 + }, + { + "epoch": 0.08918928335017245, + "grad_norm": 0.7737218614982889, + "learning_rate": 6.877145100220821e-07, + "loss": 1.703, + "step": 1280 + }, + { + "epoch": 0.08925896247778978, + "grad_norm": 0.7115474712008767, + "learning_rate": 6.876954324545854e-07, + "loss": 1.5477, + "step": 1281 + }, + { + "epoch": 0.0893286416054071, + "grad_norm": 0.8068670560532931, + "learning_rate": 6.876763403813022e-07, + "loss": 1.6857, + "step": 1282 + }, + { + "epoch": 0.08939832073302442, + "grad_norm": 0.7093087007583438, + "learning_rate": 6.876572338031475e-07, + "loss": 1.4789, + "step": 1283 + }, + { + "epoch": 0.08946799986064174, + "grad_norm": 0.7420277841762087, + "learning_rate": 6.876381127210368e-07, + "loss": 1.4069, + "step": 1284 + }, + { + "epoch": 0.08953767898825907, + "grad_norm": 1.9990965003575498, + "learning_rate": 6.876189771358862e-07, + "loss": 1.4755, + "step": 1285 + }, + { + "epoch": 0.08960735811587639, + "grad_norm": 0.6977842602151991, + "learning_rate": 6.875998270486131e-07, + "loss": 1.6219, + "step": 1286 + }, + { + "epoch": 0.08967703724349371, + "grad_norm": 0.6590490730475049, + "learning_rate": 6.875806624601351e-07, + "loss": 1.5421, + "step": 1287 + }, + { + "epoch": 0.08974671637111103, + "grad_norm": 0.6887642541353808, + "learning_rate": 6.875614833713706e-07, + "loss": 1.4732, + "step": 1288 + }, + { + "epoch": 0.08981639549872836, + "grad_norm": 0.7376671506965188, + "learning_rate": 6.875422897832385e-07, + "loss": 1.3695, + "step": 1289 + }, + { + "epoch": 0.08988607462634568, + "grad_norm": 0.7259016277612667, + "learning_rate": 6.875230816966589e-07, + "loss": 1.5346, + "step": 1290 + }, + { + "epoch": 0.089955753753963, + "grad_norm": 0.724086003625605, + "learning_rate": 6.875038591125522e-07, + "loss": 1.6004, + "step": 1291 + }, + { + "epoch": 0.09002543288158033, + "grad_norm": 0.7178196848691761, + "learning_rate": 6.874846220318393e-07, + "loss": 1.6231, + "step": 1292 + }, + { + "epoch": 0.09009511200919765, + "grad_norm": 0.7373207554001411, + "learning_rate": 6.874653704554426e-07, + "loss": 1.4717, + "step": 1293 + }, + { + "epoch": 0.09016479113681497, + "grad_norm": 0.7134230514137152, + "learning_rate": 6.874461043842843e-07, + "loss": 1.6591, + "step": 1294 + }, + { + "epoch": 0.09023447026443229, + "grad_norm": 0.7376105614130277, + "learning_rate": 6.874268238192877e-07, + "loss": 1.6336, + "step": 1295 + }, + { + "epoch": 0.09030414939204962, + "grad_norm": 0.7450088115019615, + "learning_rate": 6.874075287613769e-07, + "loss": 1.6974, + "step": 1296 + }, + { + "epoch": 0.09037382851966694, + "grad_norm": 0.6893786584174302, + "learning_rate": 6.873882192114765e-07, + "loss": 1.5151, + "step": 1297 + }, + { + "epoch": 0.09044350764728426, + "grad_norm": 0.6715073587311645, + "learning_rate": 6.873688951705119e-07, + "loss": 1.4657, + "step": 1298 + }, + { + "epoch": 0.09051318677490158, + "grad_norm": 0.7566996663979243, + "learning_rate": 6.873495566394089e-07, + "loss": 1.5942, + "step": 1299 + }, + { + "epoch": 0.0905828659025189, + "grad_norm": 0.7357953944743499, + "learning_rate": 6.873302036190946e-07, + "loss": 1.6023, + "step": 1300 + }, + { + "epoch": 0.09065254503013623, + "grad_norm": 0.7581630163955304, + "learning_rate": 6.873108361104963e-07, + "loss": 1.7109, + "step": 1301 + }, + { + "epoch": 0.09072222415775355, + "grad_norm": 0.7341133536278991, + "learning_rate": 6.872914541145419e-07, + "loss": 1.6366, + "step": 1302 + }, + { + "epoch": 0.09079190328537087, + "grad_norm": 0.6626232009933386, + "learning_rate": 6.872720576321606e-07, + "loss": 1.5272, + "step": 1303 + }, + { + "epoch": 0.0908615824129882, + "grad_norm": 0.7910328700731327, + "learning_rate": 6.872526466642818e-07, + "loss": 1.7656, + "step": 1304 + }, + { + "epoch": 0.09093126154060552, + "grad_norm": 0.7662975077969607, + "learning_rate": 6.872332212118355e-07, + "loss": 1.635, + "step": 1305 + }, + { + "epoch": 0.09100094066822284, + "grad_norm": 0.6887721459909045, + "learning_rate": 6.872137812757528e-07, + "loss": 1.523, + "step": 1306 + }, + { + "epoch": 0.09107061979584015, + "grad_norm": 0.712920618786257, + "learning_rate": 6.871943268569653e-07, + "loss": 1.6317, + "step": 1307 + }, + { + "epoch": 0.09114029892345747, + "grad_norm": 0.7577558050195798, + "learning_rate": 6.871748579564053e-07, + "loss": 1.5641, + "step": 1308 + }, + { + "epoch": 0.0912099780510748, + "grad_norm": 0.6982117743762234, + "learning_rate": 6.871553745750056e-07, + "loss": 1.4785, + "step": 1309 + }, + { + "epoch": 0.09127965717869212, + "grad_norm": 0.6991004630390641, + "learning_rate": 6.871358767137003e-07, + "loss": 1.5554, + "step": 1310 + }, + { + "epoch": 0.09134933630630944, + "grad_norm": 0.7314176603239776, + "learning_rate": 6.871163643734233e-07, + "loss": 1.6083, + "step": 1311 + }, + { + "epoch": 0.09141901543392676, + "grad_norm": 0.6877124905657467, + "learning_rate": 6.870968375551098e-07, + "loss": 1.5961, + "step": 1312 + }, + { + "epoch": 0.09148869456154408, + "grad_norm": 0.6395093045596821, + "learning_rate": 6.870772962596959e-07, + "loss": 1.5792, + "step": 1313 + }, + { + "epoch": 0.09155837368916141, + "grad_norm": 0.7082289797343458, + "learning_rate": 6.870577404881177e-07, + "loss": 1.5305, + "step": 1314 + }, + { + "epoch": 0.09162805281677873, + "grad_norm": 0.7492565372235473, + "learning_rate": 6.870381702413124e-07, + "loss": 1.6021, + "step": 1315 + }, + { + "epoch": 0.09169773194439605, + "grad_norm": 0.7328857828061993, + "learning_rate": 6.87018585520218e-07, + "loss": 1.5604, + "step": 1316 + }, + { + "epoch": 0.09176741107201337, + "grad_norm": 0.9248051912076805, + "learning_rate": 6.86998986325773e-07, + "loss": 1.5056, + "step": 1317 + }, + { + "epoch": 0.0918370901996307, + "grad_norm": 0.7133062641617752, + "learning_rate": 6.869793726589165e-07, + "loss": 1.5892, + "step": 1318 + }, + { + "epoch": 0.09190676932724802, + "grad_norm": 0.7847077630418262, + "learning_rate": 6.869597445205885e-07, + "loss": 1.5176, + "step": 1319 + }, + { + "epoch": 0.09197644845486534, + "grad_norm": 0.6839226291668064, + "learning_rate": 6.869401019117297e-07, + "loss": 1.4888, + "step": 1320 + }, + { + "epoch": 0.09204612758248266, + "grad_norm": 0.721028875233305, + "learning_rate": 6.869204448332812e-07, + "loss": 1.5445, + "step": 1321 + }, + { + "epoch": 0.09211580671009999, + "grad_norm": 0.7977990236065278, + "learning_rate": 6.869007732861853e-07, + "loss": 1.6349, + "step": 1322 + }, + { + "epoch": 0.09218548583771731, + "grad_norm": 0.726214409816211, + "learning_rate": 6.868810872713846e-07, + "loss": 1.5163, + "step": 1323 + }, + { + "epoch": 0.09225516496533463, + "grad_norm": 0.7781168458751725, + "learning_rate": 6.868613867898223e-07, + "loss": 1.585, + "step": 1324 + }, + { + "epoch": 0.09232484409295195, + "grad_norm": 0.718082280962953, + "learning_rate": 6.868416718424427e-07, + "loss": 1.5275, + "step": 1325 + }, + { + "epoch": 0.09239452322056928, + "grad_norm": 0.752359718631759, + "learning_rate": 6.868219424301905e-07, + "loss": 1.7251, + "step": 1326 + }, + { + "epoch": 0.0924642023481866, + "grad_norm": 0.7757472708282024, + "learning_rate": 6.868021985540112e-07, + "loss": 1.5877, + "step": 1327 + }, + { + "epoch": 0.09253388147580392, + "grad_norm": 0.7188829520974873, + "learning_rate": 6.867824402148509e-07, + "loss": 1.6991, + "step": 1328 + }, + { + "epoch": 0.09260356060342124, + "grad_norm": 0.6927468915057636, + "learning_rate": 6.867626674136566e-07, + "loss": 1.4599, + "step": 1329 + }, + { + "epoch": 0.09267323973103857, + "grad_norm": 0.7793706193864617, + "learning_rate": 6.867428801513757e-07, + "loss": 1.8769, + "step": 1330 + }, + { + "epoch": 0.09274291885865589, + "grad_norm": 0.7050947771615956, + "learning_rate": 6.867230784289566e-07, + "loss": 1.5947, + "step": 1331 + }, + { + "epoch": 0.09281259798627321, + "grad_norm": 0.7363007364951878, + "learning_rate": 6.867032622473481e-07, + "loss": 1.4746, + "step": 1332 + }, + { + "epoch": 0.09288227711389054, + "grad_norm": 0.7210920566358469, + "learning_rate": 6.866834316074999e-07, + "loss": 1.4997, + "step": 1333 + }, + { + "epoch": 0.09295195624150786, + "grad_norm": 0.694334164410711, + "learning_rate": 6.866635865103623e-07, + "loss": 1.5192, + "step": 1334 + }, + { + "epoch": 0.09302163536912518, + "grad_norm": 0.6826381175796876, + "learning_rate": 6.866437269568864e-07, + "loss": 1.5416, + "step": 1335 + }, + { + "epoch": 0.0930913144967425, + "grad_norm": 0.7094518106250833, + "learning_rate": 6.866238529480238e-07, + "loss": 1.6305, + "step": 1336 + }, + { + "epoch": 0.09316099362435983, + "grad_norm": 0.7451153132227982, + "learning_rate": 6.86603964484727e-07, + "loss": 1.6427, + "step": 1337 + }, + { + "epoch": 0.09323067275197715, + "grad_norm": 0.6944135717548009, + "learning_rate": 6.865840615679489e-07, + "loss": 1.5994, + "step": 1338 + }, + { + "epoch": 0.09330035187959447, + "grad_norm": 0.7324834912449787, + "learning_rate": 6.865641441986436e-07, + "loss": 1.5387, + "step": 1339 + }, + { + "epoch": 0.09337003100721179, + "grad_norm": 0.6871232434674317, + "learning_rate": 6.865442123777652e-07, + "loss": 1.4626, + "step": 1340 + }, + { + "epoch": 0.09343971013482912, + "grad_norm": 0.75076436213832, + "learning_rate": 6.865242661062692e-07, + "loss": 1.4723, + "step": 1341 + }, + { + "epoch": 0.09350938926244644, + "grad_norm": 0.6986478206384793, + "learning_rate": 6.865043053851113e-07, + "loss": 1.546, + "step": 1342 + }, + { + "epoch": 0.09357906839006376, + "grad_norm": 0.6773347909600681, + "learning_rate": 6.864843302152483e-07, + "loss": 1.5233, + "step": 1343 + }, + { + "epoch": 0.09364874751768108, + "grad_norm": 0.7256916010451714, + "learning_rate": 6.864643405976371e-07, + "loss": 1.5341, + "step": 1344 + }, + { + "epoch": 0.0937184266452984, + "grad_norm": 0.7156842660159166, + "learning_rate": 6.864443365332357e-07, + "loss": 1.4524, + "step": 1345 + }, + { + "epoch": 0.09378810577291573, + "grad_norm": 0.717844637914586, + "learning_rate": 6.86424318023003e-07, + "loss": 1.5503, + "step": 1346 + }, + { + "epoch": 0.09385778490053305, + "grad_norm": 0.7134994406384746, + "learning_rate": 6.864042850678981e-07, + "loss": 1.5557, + "step": 1347 + }, + { + "epoch": 0.09392746402815037, + "grad_norm": 0.7060719200241936, + "learning_rate": 6.863842376688812e-07, + "loss": 1.6098, + "step": 1348 + }, + { + "epoch": 0.0939971431557677, + "grad_norm": 0.7133840642791894, + "learning_rate": 6.863641758269128e-07, + "loss": 1.6069, + "step": 1349 + }, + { + "epoch": 0.09406682228338502, + "grad_norm": 0.7419267085106452, + "learning_rate": 6.863440995429543e-07, + "loss": 1.5277, + "step": 1350 + }, + { + "epoch": 0.09413650141100234, + "grad_norm": 0.7320435951110261, + "learning_rate": 6.863240088179681e-07, + "loss": 1.5051, + "step": 1351 + }, + { + "epoch": 0.09420618053861966, + "grad_norm": 0.7189557968560032, + "learning_rate": 6.863039036529167e-07, + "loss": 1.5563, + "step": 1352 + }, + { + "epoch": 0.09427585966623697, + "grad_norm": 0.7258479005816894, + "learning_rate": 6.862837840487637e-07, + "loss": 1.7108, + "step": 1353 + }, + { + "epoch": 0.0943455387938543, + "grad_norm": 0.7093271738297767, + "learning_rate": 6.862636500064733e-07, + "loss": 1.561, + "step": 1354 + }, + { + "epoch": 0.09441521792147162, + "grad_norm": 0.7289165862673944, + "learning_rate": 6.862435015270102e-07, + "loss": 1.5443, + "step": 1355 + }, + { + "epoch": 0.09448489704908894, + "grad_norm": 0.728731968183907, + "learning_rate": 6.862233386113402e-07, + "loss": 1.4719, + "step": 1356 + }, + { + "epoch": 0.09455457617670626, + "grad_norm": 0.7598985533740218, + "learning_rate": 6.862031612604292e-07, + "loss": 1.5874, + "step": 1357 + }, + { + "epoch": 0.09462425530432358, + "grad_norm": 0.7489312592826469, + "learning_rate": 6.861829694752445e-07, + "loss": 1.5196, + "step": 1358 + }, + { + "epoch": 0.09469393443194091, + "grad_norm": 0.7115801847246932, + "learning_rate": 6.861627632567536e-07, + "loss": 1.4183, + "step": 1359 + }, + { + "epoch": 0.09476361355955823, + "grad_norm": 0.7286465327425591, + "learning_rate": 6.861425426059247e-07, + "loss": 1.5136, + "step": 1360 + }, + { + "epoch": 0.09483329268717555, + "grad_norm": 0.7282399514777784, + "learning_rate": 6.861223075237272e-07, + "loss": 1.4448, + "step": 1361 + }, + { + "epoch": 0.09490297181479287, + "grad_norm": 0.6552566952542627, + "learning_rate": 6.861020580111302e-07, + "loss": 1.5461, + "step": 1362 + }, + { + "epoch": 0.0949726509424102, + "grad_norm": 0.6940211142791631, + "learning_rate": 6.860817940691046e-07, + "loss": 1.4516, + "step": 1363 + }, + { + "epoch": 0.09504233007002752, + "grad_norm": 0.7633128378880454, + "learning_rate": 6.860615156986212e-07, + "loss": 1.6427, + "step": 1364 + }, + { + "epoch": 0.09511200919764484, + "grad_norm": 0.6982614518760314, + "learning_rate": 6.86041222900652e-07, + "loss": 1.4, + "step": 1365 + }, + { + "epoch": 0.09518168832526216, + "grad_norm": 0.7177285924240916, + "learning_rate": 6.860209156761693e-07, + "loss": 1.4115, + "step": 1366 + }, + { + "epoch": 0.09525136745287949, + "grad_norm": 0.6952426491388587, + "learning_rate": 6.860005940261464e-07, + "loss": 1.4289, + "step": 1367 + }, + { + "epoch": 0.09532104658049681, + "grad_norm": 0.6700157416887091, + "learning_rate": 6.859802579515571e-07, + "loss": 1.4947, + "step": 1368 + }, + { + "epoch": 0.09539072570811413, + "grad_norm": 0.7096025037508805, + "learning_rate": 6.859599074533759e-07, + "loss": 1.5841, + "step": 1369 + }, + { + "epoch": 0.09546040483573145, + "grad_norm": 0.6999665684061958, + "learning_rate": 6.85939542532578e-07, + "loss": 1.5309, + "step": 1370 + }, + { + "epoch": 0.09553008396334878, + "grad_norm": 0.7132517833298145, + "learning_rate": 6.859191631901395e-07, + "loss": 1.6011, + "step": 1371 + }, + { + "epoch": 0.0955997630909661, + "grad_norm": 0.6828272290576067, + "learning_rate": 6.858987694270371e-07, + "loss": 1.5264, + "step": 1372 + }, + { + "epoch": 0.09566944221858342, + "grad_norm": 0.7158171608458893, + "learning_rate": 6.858783612442477e-07, + "loss": 1.5748, + "step": 1373 + }, + { + "epoch": 0.09573912134620075, + "grad_norm": 0.7506877164228436, + "learning_rate": 6.858579386427496e-07, + "loss": 1.5836, + "step": 1374 + }, + { + "epoch": 0.09580880047381807, + "grad_norm": 0.7367438933731758, + "learning_rate": 6.858375016235214e-07, + "loss": 1.6368, + "step": 1375 + }, + { + "epoch": 0.09587847960143539, + "grad_norm": 0.7285757842868972, + "learning_rate": 6.858170501875426e-07, + "loss": 1.5309, + "step": 1376 + }, + { + "epoch": 0.09594815872905271, + "grad_norm": 0.73602276499145, + "learning_rate": 6.85796584335793e-07, + "loss": 1.5709, + "step": 1377 + }, + { + "epoch": 0.09601783785667004, + "grad_norm": 0.7170190034347591, + "learning_rate": 6.857761040692537e-07, + "loss": 1.5174, + "step": 1378 + }, + { + "epoch": 0.09608751698428736, + "grad_norm": 0.7080538784656087, + "learning_rate": 6.857556093889062e-07, + "loss": 1.5577, + "step": 1379 + }, + { + "epoch": 0.09615719611190468, + "grad_norm": 0.6842467145632883, + "learning_rate": 6.857351002957321e-07, + "loss": 1.5241, + "step": 1380 + }, + { + "epoch": 0.096226875239522, + "grad_norm": 0.7093581497996692, + "learning_rate": 6.857145767907148e-07, + "loss": 1.597, + "step": 1381 + }, + { + "epoch": 0.09629655436713933, + "grad_norm": 0.7003330075726656, + "learning_rate": 6.856940388748376e-07, + "loss": 1.5867, + "step": 1382 + }, + { + "epoch": 0.09636623349475665, + "grad_norm": 0.7464105256695647, + "learning_rate": 6.856734865490847e-07, + "loss": 1.6015, + "step": 1383 + }, + { + "epoch": 0.09643591262237397, + "grad_norm": 0.7417160276838659, + "learning_rate": 6.85652919814441e-07, + "loss": 1.564, + "step": 1384 + }, + { + "epoch": 0.09650559174999129, + "grad_norm": 0.7036341159599784, + "learning_rate": 6.856323386718923e-07, + "loss": 1.4837, + "step": 1385 + }, + { + "epoch": 0.09657527087760862, + "grad_norm": 0.7182081102121308, + "learning_rate": 6.856117431224246e-07, + "loss": 1.4602, + "step": 1386 + }, + { + "epoch": 0.09664495000522594, + "grad_norm": 0.7267870082254924, + "learning_rate": 6.855911331670251e-07, + "loss": 1.5879, + "step": 1387 + }, + { + "epoch": 0.09671462913284326, + "grad_norm": 0.7263497628374512, + "learning_rate": 6.855705088066814e-07, + "loss": 1.5322, + "step": 1388 + }, + { + "epoch": 0.09678430826046058, + "grad_norm": 0.7424830626660222, + "learning_rate": 6.855498700423819e-07, + "loss": 1.663, + "step": 1389 + }, + { + "epoch": 0.0968539873880779, + "grad_norm": 0.6792334902361514, + "learning_rate": 6.855292168751155e-07, + "loss": 1.6057, + "step": 1390 + }, + { + "epoch": 0.09692366651569523, + "grad_norm": 0.6902998981753529, + "learning_rate": 6.855085493058721e-07, + "loss": 1.5234, + "step": 1391 + }, + { + "epoch": 0.09699334564331255, + "grad_norm": 0.6695799650473492, + "learning_rate": 6.854878673356421e-07, + "loss": 1.6354, + "step": 1392 + }, + { + "epoch": 0.09706302477092987, + "grad_norm": 0.707158420145716, + "learning_rate": 6.854671709654168e-07, + "loss": 1.6303, + "step": 1393 + }, + { + "epoch": 0.0971327038985472, + "grad_norm": 0.6919860585571154, + "learning_rate": 6.854464601961875e-07, + "loss": 1.5524, + "step": 1394 + }, + { + "epoch": 0.09720238302616452, + "grad_norm": 0.7174186095393233, + "learning_rate": 6.854257350289472e-07, + "loss": 1.5092, + "step": 1395 + }, + { + "epoch": 0.09727206215378184, + "grad_norm": 0.7194978937889569, + "learning_rate": 6.854049954646889e-07, + "loss": 1.6163, + "step": 1396 + }, + { + "epoch": 0.09734174128139916, + "grad_norm": 0.7669208281476789, + "learning_rate": 6.853842415044065e-07, + "loss": 1.4796, + "step": 1397 + }, + { + "epoch": 0.09741142040901649, + "grad_norm": 0.7872523249133949, + "learning_rate": 6.853634731490944e-07, + "loss": 1.5655, + "step": 1398 + }, + { + "epoch": 0.0974810995366338, + "grad_norm": 0.72278764585173, + "learning_rate": 6.853426903997482e-07, + "loss": 1.5231, + "step": 1399 + }, + { + "epoch": 0.09755077866425112, + "grad_norm": 0.6911638249630152, + "learning_rate": 6.853218932573636e-07, + "loss": 1.4571, + "step": 1400 + }, + { + "epoch": 0.09762045779186844, + "grad_norm": 0.7409581114985209, + "learning_rate": 6.853010817229374e-07, + "loss": 1.5235, + "step": 1401 + }, + { + "epoch": 0.09769013691948576, + "grad_norm": 0.7512553391928395, + "learning_rate": 6.852802557974668e-07, + "loss": 1.6149, + "step": 1402 + }, + { + "epoch": 0.09775981604710308, + "grad_norm": 0.7542228208631525, + "learning_rate": 6.8525941548195e-07, + "loss": 1.6557, + "step": 1403 + }, + { + "epoch": 0.09782949517472041, + "grad_norm": 0.730933943838304, + "learning_rate": 6.852385607773855e-07, + "loss": 1.5203, + "step": 1404 + }, + { + "epoch": 0.09789917430233773, + "grad_norm": 0.7226965064395986, + "learning_rate": 6.852176916847728e-07, + "loss": 1.5156, + "step": 1405 + }, + { + "epoch": 0.09796885342995505, + "grad_norm": 0.7400386566230431, + "learning_rate": 6.851968082051119e-07, + "loss": 1.4812, + "step": 1406 + }, + { + "epoch": 0.09803853255757237, + "grad_norm": 0.7217069234943865, + "learning_rate": 6.851759103394038e-07, + "loss": 1.6011, + "step": 1407 + }, + { + "epoch": 0.0981082116851897, + "grad_norm": 0.7208248423892274, + "learning_rate": 6.851549980886498e-07, + "loss": 1.5482, + "step": 1408 + }, + { + "epoch": 0.09817789081280702, + "grad_norm": 0.73869785008963, + "learning_rate": 6.851340714538519e-07, + "loss": 1.5056, + "step": 1409 + }, + { + "epoch": 0.09824756994042434, + "grad_norm": 0.7205356992642813, + "learning_rate": 6.851131304360134e-07, + "loss": 1.6143, + "step": 1410 + }, + { + "epoch": 0.09831724906804166, + "grad_norm": 0.7504594927506455, + "learning_rate": 6.850921750361374e-07, + "loss": 1.5684, + "step": 1411 + }, + { + "epoch": 0.09838692819565899, + "grad_norm": 0.7912872734883598, + "learning_rate": 6.850712052552282e-07, + "loss": 1.5114, + "step": 1412 + }, + { + "epoch": 0.09845660732327631, + "grad_norm": 0.6954212545087036, + "learning_rate": 6.85050221094291e-07, + "loss": 1.5148, + "step": 1413 + }, + { + "epoch": 0.09852628645089363, + "grad_norm": 0.7121224389351605, + "learning_rate": 6.850292225543312e-07, + "loss": 1.5543, + "step": 1414 + }, + { + "epoch": 0.09859596557851096, + "grad_norm": 0.7308811982222069, + "learning_rate": 6.850082096363551e-07, + "loss": 1.5675, + "step": 1415 + }, + { + "epoch": 0.09866564470612828, + "grad_norm": 0.6669232499820077, + "learning_rate": 6.849871823413696e-07, + "loss": 1.5222, + "step": 1416 + }, + { + "epoch": 0.0987353238337456, + "grad_norm": 0.7023704892031762, + "learning_rate": 6.849661406703825e-07, + "loss": 1.4323, + "step": 1417 + }, + { + "epoch": 0.09880500296136292, + "grad_norm": 0.7450305359211221, + "learning_rate": 6.849450846244022e-07, + "loss": 1.6129, + "step": 1418 + }, + { + "epoch": 0.09887468208898025, + "grad_norm": 0.7559519975027, + "learning_rate": 6.849240142044376e-07, + "loss": 1.543, + "step": 1419 + }, + { + "epoch": 0.09894436121659757, + "grad_norm": 0.7557039933040223, + "learning_rate": 6.849029294114985e-07, + "loss": 1.7281, + "step": 1420 + }, + { + "epoch": 0.09901404034421489, + "grad_norm": 0.7535013429286772, + "learning_rate": 6.848818302465954e-07, + "loss": 1.5028, + "step": 1421 + }, + { + "epoch": 0.09908371947183221, + "grad_norm": 0.695752285288689, + "learning_rate": 6.848607167107393e-07, + "loss": 1.5552, + "step": 1422 + }, + { + "epoch": 0.09915339859944954, + "grad_norm": 0.7130712608592883, + "learning_rate": 6.848395888049421e-07, + "loss": 1.5192, + "step": 1423 + }, + { + "epoch": 0.09922307772706686, + "grad_norm": 0.7462396757316636, + "learning_rate": 6.84818446530216e-07, + "loss": 1.6576, + "step": 1424 + }, + { + "epoch": 0.09929275685468418, + "grad_norm": 0.6936681758464512, + "learning_rate": 6.847972898875747e-07, + "loss": 1.4941, + "step": 1425 + }, + { + "epoch": 0.0993624359823015, + "grad_norm": 0.7291394697109261, + "learning_rate": 6.847761188780318e-07, + "loss": 1.5412, + "step": 1426 + }, + { + "epoch": 0.09943211510991883, + "grad_norm": 0.8215857318532434, + "learning_rate": 6.847549335026017e-07, + "loss": 1.6619, + "step": 1427 + }, + { + "epoch": 0.09950179423753615, + "grad_norm": 0.7589146596740709, + "learning_rate": 6.847337337623e-07, + "loss": 1.5622, + "step": 1428 + }, + { + "epoch": 0.09957147336515347, + "grad_norm": 0.7000258820654407, + "learning_rate": 6.847125196581422e-07, + "loss": 1.6265, + "step": 1429 + }, + { + "epoch": 0.0996411524927708, + "grad_norm": 0.6653107210684309, + "learning_rate": 6.846912911911453e-07, + "loss": 1.4856, + "step": 1430 + }, + { + "epoch": 0.09971083162038812, + "grad_norm": 0.6702010457721812, + "learning_rate": 6.846700483623265e-07, + "loss": 1.4824, + "step": 1431 + }, + { + "epoch": 0.09978051074800544, + "grad_norm": 0.7175711439850605, + "learning_rate": 6.846487911727036e-07, + "loss": 1.5737, + "step": 1432 + }, + { + "epoch": 0.09985018987562276, + "grad_norm": 0.729863487630728, + "learning_rate": 6.846275196232956e-07, + "loss": 1.4471, + "step": 1433 + }, + { + "epoch": 0.09991986900324008, + "grad_norm": 0.6618160691322408, + "learning_rate": 6.846062337151217e-07, + "loss": 1.4689, + "step": 1434 + }, + { + "epoch": 0.0999895481308574, + "grad_norm": 0.761821100047957, + "learning_rate": 6.845849334492021e-07, + "loss": 1.6445, + "step": 1435 + }, + { + "epoch": 0.10005922725847473, + "grad_norm": 0.7375583392439995, + "learning_rate": 6.845636188265573e-07, + "loss": 1.5686, + "step": 1436 + }, + { + "epoch": 0.10012890638609205, + "grad_norm": 0.7082128443723841, + "learning_rate": 6.845422898482089e-07, + "loss": 1.4708, + "step": 1437 + }, + { + "epoch": 0.10019858551370937, + "grad_norm": 0.7831935456199045, + "learning_rate": 6.845209465151791e-07, + "loss": 1.5965, + "step": 1438 + }, + { + "epoch": 0.1002682646413267, + "grad_norm": 0.6625792127601626, + "learning_rate": 6.844995888284906e-07, + "loss": 1.4301, + "step": 1439 + }, + { + "epoch": 0.10033794376894402, + "grad_norm": 0.7224274069117864, + "learning_rate": 6.84478216789167e-07, + "loss": 1.52, + "step": 1440 + }, + { + "epoch": 0.10040762289656134, + "grad_norm": 0.7306401875419833, + "learning_rate": 6.844568303982324e-07, + "loss": 1.6456, + "step": 1441 + }, + { + "epoch": 0.10047730202417866, + "grad_norm": 0.6991325586636551, + "learning_rate": 6.844354296567117e-07, + "loss": 1.5894, + "step": 1442 + }, + { + "epoch": 0.10054698115179599, + "grad_norm": 0.8232785004103219, + "learning_rate": 6.844140145656305e-07, + "loss": 1.7543, + "step": 1443 + }, + { + "epoch": 0.10061666027941331, + "grad_norm": 0.7215617175038418, + "learning_rate": 6.84392585126015e-07, + "loss": 1.5773, + "step": 1444 + }, + { + "epoch": 0.10068633940703062, + "grad_norm": 0.7374985957692638, + "learning_rate": 6.843711413388923e-07, + "loss": 1.5536, + "step": 1445 + }, + { + "epoch": 0.10075601853464794, + "grad_norm": 0.7104503086346701, + "learning_rate": 6.843496832052897e-07, + "loss": 1.5673, + "step": 1446 + }, + { + "epoch": 0.10082569766226526, + "grad_norm": 0.7036943520124541, + "learning_rate": 6.843282107262359e-07, + "loss": 1.4641, + "step": 1447 + }, + { + "epoch": 0.10089537678988258, + "grad_norm": 0.7682117601321635, + "learning_rate": 6.843067239027598e-07, + "loss": 1.4993, + "step": 1448 + }, + { + "epoch": 0.10096505591749991, + "grad_norm": 0.7987108957657223, + "learning_rate": 6.842852227358907e-07, + "loss": 1.6263, + "step": 1449 + }, + { + "epoch": 0.10103473504511723, + "grad_norm": 0.7182857850830695, + "learning_rate": 6.842637072266596e-07, + "loss": 1.53, + "step": 1450 + }, + { + "epoch": 0.10110441417273455, + "grad_norm": 0.7246888294850192, + "learning_rate": 6.842421773760972e-07, + "loss": 1.7401, + "step": 1451 + }, + { + "epoch": 0.10117409330035187, + "grad_norm": 0.7094639682875344, + "learning_rate": 6.842206331852352e-07, + "loss": 1.5597, + "step": 1452 + }, + { + "epoch": 0.1012437724279692, + "grad_norm": 0.744605882539227, + "learning_rate": 6.841990746551064e-07, + "loss": 1.6996, + "step": 1453 + }, + { + "epoch": 0.10131345155558652, + "grad_norm": 0.7732575296825245, + "learning_rate": 6.841775017867435e-07, + "loss": 1.6414, + "step": 1454 + }, + { + "epoch": 0.10138313068320384, + "grad_norm": 0.7219152072214178, + "learning_rate": 6.841559145811805e-07, + "loss": 1.6682, + "step": 1455 + }, + { + "epoch": 0.10145280981082117, + "grad_norm": 0.7089114420009685, + "learning_rate": 6.84134313039452e-07, + "loss": 1.6003, + "step": 1456 + }, + { + "epoch": 0.10152248893843849, + "grad_norm": 0.6822564437101662, + "learning_rate": 6.841126971625932e-07, + "loss": 1.5532, + "step": 1457 + }, + { + "epoch": 0.10159216806605581, + "grad_norm": 0.6908697206870166, + "learning_rate": 6.840910669516399e-07, + "loss": 1.551, + "step": 1458 + }, + { + "epoch": 0.10166184719367313, + "grad_norm": 0.749847037637897, + "learning_rate": 6.840694224076284e-07, + "loss": 1.511, + "step": 1459 + }, + { + "epoch": 0.10173152632129046, + "grad_norm": 0.6973808625903857, + "learning_rate": 6.840477635315965e-07, + "loss": 1.6447, + "step": 1460 + }, + { + "epoch": 0.10180120544890778, + "grad_norm": 0.6637049891036413, + "learning_rate": 6.840260903245816e-07, + "loss": 1.4964, + "step": 1461 + }, + { + "epoch": 0.1018708845765251, + "grad_norm": 0.7051328218519948, + "learning_rate": 6.840044027876226e-07, + "loss": 1.4516, + "step": 1462 + }, + { + "epoch": 0.10194056370414242, + "grad_norm": 0.6998029876564847, + "learning_rate": 6.839827009217589e-07, + "loss": 1.7098, + "step": 1463 + }, + { + "epoch": 0.10201024283175975, + "grad_norm": 0.8312269910488456, + "learning_rate": 6.839609847280303e-07, + "loss": 1.4785, + "step": 1464 + }, + { + "epoch": 0.10207992195937707, + "grad_norm": 0.692043398562744, + "learning_rate": 6.839392542074777e-07, + "loss": 1.3329, + "step": 1465 + }, + { + "epoch": 0.10214960108699439, + "grad_norm": 0.6905454250734092, + "learning_rate": 6.83917509361142e-07, + "loss": 1.6113, + "step": 1466 + }, + { + "epoch": 0.10221928021461171, + "grad_norm": 0.6692689047744544, + "learning_rate": 6.838957501900658e-07, + "loss": 1.5585, + "step": 1467 + }, + { + "epoch": 0.10228895934222904, + "grad_norm": 0.6866297965522439, + "learning_rate": 6.838739766952916e-07, + "loss": 1.491, + "step": 1468 + }, + { + "epoch": 0.10235863846984636, + "grad_norm": 0.713111200601478, + "learning_rate": 6.838521888778629e-07, + "loss": 1.4458, + "step": 1469 + }, + { + "epoch": 0.10242831759746368, + "grad_norm": 0.7596916282973877, + "learning_rate": 6.838303867388237e-07, + "loss": 1.5562, + "step": 1470 + }, + { + "epoch": 0.102497996725081, + "grad_norm": 0.7729511940587167, + "learning_rate": 6.83808570279219e-07, + "loss": 1.4941, + "step": 1471 + }, + { + "epoch": 0.10256767585269833, + "grad_norm": 0.6728813668527486, + "learning_rate": 6.83786739500094e-07, + "loss": 1.3944, + "step": 1472 + }, + { + "epoch": 0.10263735498031565, + "grad_norm": 0.7228001449563164, + "learning_rate": 6.837648944024951e-07, + "loss": 1.6582, + "step": 1473 + }, + { + "epoch": 0.10270703410793297, + "grad_norm": 0.7070507373346999, + "learning_rate": 6.837430349874693e-07, + "loss": 1.5626, + "step": 1474 + }, + { + "epoch": 0.1027767132355503, + "grad_norm": 0.6647571817731183, + "learning_rate": 6.837211612560636e-07, + "loss": 1.4195, + "step": 1475 + }, + { + "epoch": 0.10284639236316762, + "grad_norm": 0.7221508915094103, + "learning_rate": 6.836992732093267e-07, + "loss": 1.6211, + "step": 1476 + }, + { + "epoch": 0.10291607149078494, + "grad_norm": 0.7109611055734725, + "learning_rate": 6.836773708483076e-07, + "loss": 1.5389, + "step": 1477 + }, + { + "epoch": 0.10298575061840226, + "grad_norm": 0.6907251786674505, + "learning_rate": 6.836554541740556e-07, + "loss": 1.5164, + "step": 1478 + }, + { + "epoch": 0.10305542974601958, + "grad_norm": 0.699620393769317, + "learning_rate": 6.836335231876212e-07, + "loss": 1.7227, + "step": 1479 + }, + { + "epoch": 0.1031251088736369, + "grad_norm": 0.6940924493567484, + "learning_rate": 6.836115778900552e-07, + "loss": 1.5148, + "step": 1480 + }, + { + "epoch": 0.10319478800125423, + "grad_norm": 0.7238715870249767, + "learning_rate": 6.835896182824093e-07, + "loss": 1.5669, + "step": 1481 + }, + { + "epoch": 0.10326446712887155, + "grad_norm": 0.7088044163423685, + "learning_rate": 6.83567644365736e-07, + "loss": 1.4448, + "step": 1482 + }, + { + "epoch": 0.10333414625648887, + "grad_norm": 0.7447083571913127, + "learning_rate": 6.835456561410882e-07, + "loss": 1.6735, + "step": 1483 + }, + { + "epoch": 0.1034038253841062, + "grad_norm": 0.6910524206838882, + "learning_rate": 6.835236536095197e-07, + "loss": 1.5637, + "step": 1484 + }, + { + "epoch": 0.10347350451172352, + "grad_norm": 0.6593429324828214, + "learning_rate": 6.835016367720847e-07, + "loss": 1.4477, + "step": 1485 + }, + { + "epoch": 0.10354318363934084, + "grad_norm": 0.7235635260040205, + "learning_rate": 6.834796056298386e-07, + "loss": 1.5663, + "step": 1486 + }, + { + "epoch": 0.10361286276695816, + "grad_norm": 0.6955384974930857, + "learning_rate": 6.83457560183837e-07, + "loss": 1.5034, + "step": 1487 + }, + { + "epoch": 0.10368254189457549, + "grad_norm": 0.6869816822781529, + "learning_rate": 6.834355004351363e-07, + "loss": 1.5124, + "step": 1488 + }, + { + "epoch": 0.10375222102219281, + "grad_norm": 0.7239903773603513, + "learning_rate": 6.834134263847939e-07, + "loss": 1.5423, + "step": 1489 + }, + { + "epoch": 0.10382190014981013, + "grad_norm": 0.7255177486951737, + "learning_rate": 6.833913380338675e-07, + "loss": 1.468, + "step": 1490 + }, + { + "epoch": 0.10389157927742744, + "grad_norm": 0.6935427630827753, + "learning_rate": 6.833692353834154e-07, + "loss": 1.5646, + "step": 1491 + }, + { + "epoch": 0.10396125840504476, + "grad_norm": 0.78289308346883, + "learning_rate": 6.833471184344971e-07, + "loss": 1.61, + "step": 1492 + }, + { + "epoch": 0.10403093753266208, + "grad_norm": 0.7540578067091052, + "learning_rate": 6.833249871881725e-07, + "loss": 1.6918, + "step": 1493 + }, + { + "epoch": 0.10410061666027941, + "grad_norm": 0.6723185719673338, + "learning_rate": 6.833028416455019e-07, + "loss": 1.5178, + "step": 1494 + }, + { + "epoch": 0.10417029578789673, + "grad_norm": 0.7225132128946964, + "learning_rate": 6.832806818075467e-07, + "loss": 1.5849, + "step": 1495 + }, + { + "epoch": 0.10423997491551405, + "grad_norm": 0.7256132910891259, + "learning_rate": 6.83258507675369e-07, + "loss": 1.6121, + "step": 1496 + }, + { + "epoch": 0.10430965404313138, + "grad_norm": 0.7833180897554166, + "learning_rate": 6.832363192500312e-07, + "loss": 1.5221, + "step": 1497 + }, + { + "epoch": 0.1043793331707487, + "grad_norm": 0.7612167370203573, + "learning_rate": 6.832141165325967e-07, + "loss": 1.5844, + "step": 1498 + }, + { + "epoch": 0.10444901229836602, + "grad_norm": 0.7410871801404689, + "learning_rate": 6.831918995241296e-07, + "loss": 1.6095, + "step": 1499 + }, + { + "epoch": 0.10451869142598334, + "grad_norm": 0.7017298260754218, + "learning_rate": 6.831696682256944e-07, + "loss": 1.511, + "step": 1500 + }, + { + "epoch": 0.10458837055360067, + "grad_norm": 0.7339141096335122, + "learning_rate": 6.831474226383567e-07, + "loss": 1.6158, + "step": 1501 + }, + { + "epoch": 0.10465804968121799, + "grad_norm": 0.7315827206339433, + "learning_rate": 6.831251627631824e-07, + "loss": 1.5025, + "step": 1502 + }, + { + "epoch": 0.10472772880883531, + "grad_norm": 0.8214608688213216, + "learning_rate": 6.831028886012382e-07, + "loss": 1.4651, + "step": 1503 + }, + { + "epoch": 0.10479740793645263, + "grad_norm": 0.7050091177175483, + "learning_rate": 6.830806001535916e-07, + "loss": 1.5367, + "step": 1504 + }, + { + "epoch": 0.10486708706406996, + "grad_norm": 0.7055266701449938, + "learning_rate": 6.830582974213108e-07, + "loss": 1.5644, + "step": 1505 + }, + { + "epoch": 0.10493676619168728, + "grad_norm": 0.7094965831953183, + "learning_rate": 6.830359804054643e-07, + "loss": 1.6474, + "step": 1506 + }, + { + "epoch": 0.1050064453193046, + "grad_norm": 0.6640006713070633, + "learning_rate": 6.83013649107122e-07, + "loss": 1.3401, + "step": 1507 + }, + { + "epoch": 0.10507612444692192, + "grad_norm": 0.7105850301509256, + "learning_rate": 6.829913035273536e-07, + "loss": 1.455, + "step": 1508 + }, + { + "epoch": 0.10514580357453925, + "grad_norm": 0.6867742081585059, + "learning_rate": 6.829689436672302e-07, + "loss": 1.2886, + "step": 1509 + }, + { + "epoch": 0.10521548270215657, + "grad_norm": 0.6958585661416363, + "learning_rate": 6.829465695278233e-07, + "loss": 1.5809, + "step": 1510 + }, + { + "epoch": 0.10528516182977389, + "grad_norm": 0.7797334011110041, + "learning_rate": 6.829241811102052e-07, + "loss": 1.474, + "step": 1511 + }, + { + "epoch": 0.10535484095739121, + "grad_norm": 0.6753774832924908, + "learning_rate": 6.829017784154487e-07, + "loss": 1.5347, + "step": 1512 + }, + { + "epoch": 0.10542452008500854, + "grad_norm": 0.6657324139543032, + "learning_rate": 6.828793614446273e-07, + "loss": 1.4748, + "step": 1513 + }, + { + "epoch": 0.10549419921262586, + "grad_norm": 0.7217234617981039, + "learning_rate": 6.828569301988155e-07, + "loss": 1.5803, + "step": 1514 + }, + { + "epoch": 0.10556387834024318, + "grad_norm": 0.7292235276823041, + "learning_rate": 6.82834484679088e-07, + "loss": 1.4454, + "step": 1515 + }, + { + "epoch": 0.1056335574678605, + "grad_norm": 0.6526371966253907, + "learning_rate": 6.828120248865206e-07, + "loss": 1.415, + "step": 1516 + }, + { + "epoch": 0.10570323659547783, + "grad_norm": 0.7198666759857738, + "learning_rate": 6.827895508221897e-07, + "loss": 1.5559, + "step": 1517 + }, + { + "epoch": 0.10577291572309515, + "grad_norm": 0.7628274611446516, + "learning_rate": 6.827670624871721e-07, + "loss": 1.5228, + "step": 1518 + }, + { + "epoch": 0.10584259485071247, + "grad_norm": 0.6985806196410942, + "learning_rate": 6.827445598825453e-07, + "loss": 1.5399, + "step": 1519 + }, + { + "epoch": 0.1059122739783298, + "grad_norm": 0.7268289166229313, + "learning_rate": 6.827220430093882e-07, + "loss": 1.5711, + "step": 1520 + }, + { + "epoch": 0.10598195310594712, + "grad_norm": 0.7159822726292894, + "learning_rate": 6.826995118687796e-07, + "loss": 1.3665, + "step": 1521 + }, + { + "epoch": 0.10605163223356444, + "grad_norm": 0.6799598821150186, + "learning_rate": 6.826769664617991e-07, + "loss": 1.5734, + "step": 1522 + }, + { + "epoch": 0.10612131136118176, + "grad_norm": 0.6759158480037062, + "learning_rate": 6.826544067895273e-07, + "loss": 1.5268, + "step": 1523 + }, + { + "epoch": 0.10619099048879908, + "grad_norm": 0.6764740596037916, + "learning_rate": 6.826318328530453e-07, + "loss": 1.5798, + "step": 1524 + }, + { + "epoch": 0.1062606696164164, + "grad_norm": 0.7544503094902575, + "learning_rate": 6.826092446534348e-07, + "loss": 1.6701, + "step": 1525 + }, + { + "epoch": 0.10633034874403373, + "grad_norm": 0.7599345785560571, + "learning_rate": 6.825866421917783e-07, + "loss": 1.5108, + "step": 1526 + }, + { + "epoch": 0.10640002787165105, + "grad_norm": 0.6887529792956971, + "learning_rate": 6.82564025469159e-07, + "loss": 1.4195, + "step": 1527 + }, + { + "epoch": 0.10646970699926837, + "grad_norm": 0.701570365983734, + "learning_rate": 6.825413944866607e-07, + "loss": 1.5902, + "step": 1528 + }, + { + "epoch": 0.1065393861268857, + "grad_norm": 0.6851192165134431, + "learning_rate": 6.825187492453679e-07, + "loss": 1.5846, + "step": 1529 + }, + { + "epoch": 0.10660906525450302, + "grad_norm": 0.6547795464181767, + "learning_rate": 6.82496089746366e-07, + "loss": 1.3953, + "step": 1530 + }, + { + "epoch": 0.10667874438212034, + "grad_norm": 0.7023921604836459, + "learning_rate": 6.824734159907405e-07, + "loss": 1.6031, + "step": 1531 + }, + { + "epoch": 0.10674842350973766, + "grad_norm": 0.7383875135991188, + "learning_rate": 6.824507279795784e-07, + "loss": 1.5285, + "step": 1532 + }, + { + "epoch": 0.10681810263735499, + "grad_norm": 0.7322590444258, + "learning_rate": 6.824280257139667e-07, + "loss": 1.4706, + "step": 1533 + }, + { + "epoch": 0.10688778176497231, + "grad_norm": 0.6595049248009317, + "learning_rate": 6.824053091949933e-07, + "loss": 1.4327, + "step": 1534 + }, + { + "epoch": 0.10695746089258963, + "grad_norm": 0.7165800718934179, + "learning_rate": 6.823825784237471e-07, + "loss": 1.5829, + "step": 1535 + }, + { + "epoch": 0.10702714002020695, + "grad_norm": 0.704731120016156, + "learning_rate": 6.82359833401317e-07, + "loss": 1.5255, + "step": 1536 + }, + { + "epoch": 0.10709681914782426, + "grad_norm": 0.6997247632243365, + "learning_rate": 6.823370741287933e-07, + "loss": 1.6407, + "step": 1537 + }, + { + "epoch": 0.10716649827544159, + "grad_norm": 0.7023753654540732, + "learning_rate": 6.823143006072667e-07, + "loss": 1.5886, + "step": 1538 + }, + { + "epoch": 0.10723617740305891, + "grad_norm": 0.6743001705870307, + "learning_rate": 6.822915128378284e-07, + "loss": 1.572, + "step": 1539 + }, + { + "epoch": 0.10730585653067623, + "grad_norm": 0.6750188030153429, + "learning_rate": 6.822687108215704e-07, + "loss": 1.5842, + "step": 1540 + }, + { + "epoch": 0.10737553565829355, + "grad_norm": 0.7262010807988295, + "learning_rate": 6.822458945595856e-07, + "loss": 1.5688, + "step": 1541 + }, + { + "epoch": 0.10744521478591088, + "grad_norm": 0.7212282946377836, + "learning_rate": 6.822230640529671e-07, + "loss": 1.5317, + "step": 1542 + }, + { + "epoch": 0.1075148939135282, + "grad_norm": 0.8885724857888603, + "learning_rate": 6.822002193028095e-07, + "loss": 1.6592, + "step": 1543 + }, + { + "epoch": 0.10758457304114552, + "grad_norm": 0.7197806691400269, + "learning_rate": 6.82177360310207e-07, + "loss": 1.5434, + "step": 1544 + }, + { + "epoch": 0.10765425216876284, + "grad_norm": 0.7558178313589539, + "learning_rate": 6.821544870762554e-07, + "loss": 1.6391, + "step": 1545 + }, + { + "epoch": 0.10772393129638017, + "grad_norm": 0.7987854415241106, + "learning_rate": 6.821315996020506e-07, + "loss": 1.5524, + "step": 1546 + }, + { + "epoch": 0.10779361042399749, + "grad_norm": 0.6686455155160611, + "learning_rate": 6.821086978886897e-07, + "loss": 1.4751, + "step": 1547 + }, + { + "epoch": 0.10786328955161481, + "grad_norm": 0.7341664427885704, + "learning_rate": 6.8208578193727e-07, + "loss": 1.5269, + "step": 1548 + }, + { + "epoch": 0.10793296867923213, + "grad_norm": 0.7003223328477466, + "learning_rate": 6.820628517488898e-07, + "loss": 1.5568, + "step": 1549 + }, + { + "epoch": 0.10800264780684946, + "grad_norm": 0.6864701685410314, + "learning_rate": 6.820399073246477e-07, + "loss": 1.4843, + "step": 1550 + }, + { + "epoch": 0.10807232693446678, + "grad_norm": 0.8687409341573422, + "learning_rate": 6.820169486656435e-07, + "loss": 1.5423, + "step": 1551 + }, + { + "epoch": 0.1081420060620841, + "grad_norm": 0.7607151628672985, + "learning_rate": 6.819939757729774e-07, + "loss": 1.5437, + "step": 1552 + }, + { + "epoch": 0.10821168518970142, + "grad_norm": 0.7048985366339954, + "learning_rate": 6.819709886477503e-07, + "loss": 1.5318, + "step": 1553 + }, + { + "epoch": 0.10828136431731875, + "grad_norm": 0.7314286735099996, + "learning_rate": 6.819479872910638e-07, + "loss": 1.5726, + "step": 1554 + }, + { + "epoch": 0.10835104344493607, + "grad_norm": 0.6962507248591363, + "learning_rate": 6.8192497170402e-07, + "loss": 1.4723, + "step": 1555 + }, + { + "epoch": 0.10842072257255339, + "grad_norm": 0.7785074386756621, + "learning_rate": 6.819019418877221e-07, + "loss": 1.6421, + "step": 1556 + }, + { + "epoch": 0.10849040170017071, + "grad_norm": 0.7954582117985666, + "learning_rate": 6.818788978432735e-07, + "loss": 1.664, + "step": 1557 + }, + { + "epoch": 0.10856008082778804, + "grad_norm": 0.7367228514312243, + "learning_rate": 6.818558395717786e-07, + "loss": 1.5606, + "step": 1558 + }, + { + "epoch": 0.10862975995540536, + "grad_norm": 0.745909368111922, + "learning_rate": 6.818327670743425e-07, + "loss": 1.4566, + "step": 1559 + }, + { + "epoch": 0.10869943908302268, + "grad_norm": 0.7600253418884132, + "learning_rate": 6.818096803520709e-07, + "loss": 1.6048, + "step": 1560 + }, + { + "epoch": 0.10876911821064, + "grad_norm": 0.7373741671607602, + "learning_rate": 6.817865794060699e-07, + "loss": 1.423, + "step": 1561 + }, + { + "epoch": 0.10883879733825733, + "grad_norm": 0.6896732729151899, + "learning_rate": 6.817634642374468e-07, + "loss": 1.5513, + "step": 1562 + }, + { + "epoch": 0.10890847646587465, + "grad_norm": 0.6983623496494458, + "learning_rate": 6.817403348473094e-07, + "loss": 1.5651, + "step": 1563 + }, + { + "epoch": 0.10897815559349197, + "grad_norm": 0.7525447676994738, + "learning_rate": 6.817171912367657e-07, + "loss": 1.6091, + "step": 1564 + }, + { + "epoch": 0.1090478347211093, + "grad_norm": 0.7532040188348436, + "learning_rate": 6.816940334069252e-07, + "loss": 1.5547, + "step": 1565 + }, + { + "epoch": 0.10911751384872662, + "grad_norm": 0.7377763746387574, + "learning_rate": 6.816708613588975e-07, + "loss": 1.6717, + "step": 1566 + }, + { + "epoch": 0.10918719297634394, + "grad_norm": 0.7404993105111815, + "learning_rate": 6.816476750937931e-07, + "loss": 1.5567, + "step": 1567 + }, + { + "epoch": 0.10925687210396126, + "grad_norm": 0.6607513401827204, + "learning_rate": 6.816244746127231e-07, + "loss": 1.4863, + "step": 1568 + }, + { + "epoch": 0.10932655123157858, + "grad_norm": 0.7327524646568696, + "learning_rate": 6.816012599167993e-07, + "loss": 1.4312, + "step": 1569 + }, + { + "epoch": 0.1093962303591959, + "grad_norm": 0.7166557657811868, + "learning_rate": 6.815780310071341e-07, + "loss": 1.5816, + "step": 1570 + }, + { + "epoch": 0.10946590948681323, + "grad_norm": 0.695293401858971, + "learning_rate": 6.81554787884841e-07, + "loss": 1.5191, + "step": 1571 + }, + { + "epoch": 0.10953558861443055, + "grad_norm": 0.7131918332055402, + "learning_rate": 6.815315305510336e-07, + "loss": 1.6284, + "step": 1572 + }, + { + "epoch": 0.10960526774204787, + "grad_norm": 0.7076583193113377, + "learning_rate": 6.815082590068264e-07, + "loss": 1.535, + "step": 1573 + }, + { + "epoch": 0.1096749468696652, + "grad_norm": 0.7135512952756091, + "learning_rate": 6.814849732533347e-07, + "loss": 1.5333, + "step": 1574 + }, + { + "epoch": 0.10974462599728252, + "grad_norm": 0.7538494298477586, + "learning_rate": 6.814616732916744e-07, + "loss": 1.4118, + "step": 1575 + }, + { + "epoch": 0.10981430512489984, + "grad_norm": 0.7178907661850447, + "learning_rate": 6.814383591229622e-07, + "loss": 1.3534, + "step": 1576 + }, + { + "epoch": 0.10988398425251716, + "grad_norm": 0.7308851126536335, + "learning_rate": 6.814150307483151e-07, + "loss": 1.5599, + "step": 1577 + }, + { + "epoch": 0.10995366338013449, + "grad_norm": 0.7669543415808457, + "learning_rate": 6.813916881688513e-07, + "loss": 1.54, + "step": 1578 + }, + { + "epoch": 0.11002334250775181, + "grad_norm": 0.8240784223589613, + "learning_rate": 6.813683313856894e-07, + "loss": 1.5358, + "step": 1579 + }, + { + "epoch": 0.11009302163536913, + "grad_norm": 0.7221931116839967, + "learning_rate": 6.813449603999485e-07, + "loss": 1.6244, + "step": 1580 + }, + { + "epoch": 0.11016270076298645, + "grad_norm": 0.7189519304364052, + "learning_rate": 6.813215752127488e-07, + "loss": 1.4958, + "step": 1581 + }, + { + "epoch": 0.11023237989060376, + "grad_norm": 0.7511549610377619, + "learning_rate": 6.812981758252108e-07, + "loss": 1.6661, + "step": 1582 + }, + { + "epoch": 0.11030205901822109, + "grad_norm": 0.7497943083182442, + "learning_rate": 6.81274762238456e-07, + "loss": 1.5513, + "step": 1583 + }, + { + "epoch": 0.11037173814583841, + "grad_norm": 0.7112454154563912, + "learning_rate": 6.812513344536063e-07, + "loss": 1.5281, + "step": 1584 + }, + { + "epoch": 0.11044141727345573, + "grad_norm": 0.7163312744702844, + "learning_rate": 6.812278924717844e-07, + "loss": 1.5135, + "step": 1585 + }, + { + "epoch": 0.11051109640107305, + "grad_norm": 0.7529791816288604, + "learning_rate": 6.812044362941139e-07, + "loss": 1.6745, + "step": 1586 + }, + { + "epoch": 0.11058077552869038, + "grad_norm": 0.6813105813519863, + "learning_rate": 6.811809659217186e-07, + "loss": 1.3124, + "step": 1587 + }, + { + "epoch": 0.1106504546563077, + "grad_norm": 0.6772757264130072, + "learning_rate": 6.811574813557234e-07, + "loss": 1.5298, + "step": 1588 + }, + { + "epoch": 0.11072013378392502, + "grad_norm": 0.6824509840298503, + "learning_rate": 6.811339825972538e-07, + "loss": 1.6369, + "step": 1589 + }, + { + "epoch": 0.11078981291154234, + "grad_norm": 0.7296519362331932, + "learning_rate": 6.811104696474356e-07, + "loss": 1.5423, + "step": 1590 + }, + { + "epoch": 0.11085949203915967, + "grad_norm": 0.718482315453647, + "learning_rate": 6.810869425073959e-07, + "loss": 1.6452, + "step": 1591 + }, + { + "epoch": 0.11092917116677699, + "grad_norm": 0.722378080045864, + "learning_rate": 6.81063401178262e-07, + "loss": 1.4309, + "step": 1592 + }, + { + "epoch": 0.11099885029439431, + "grad_norm": 0.6985374571080087, + "learning_rate": 6.810398456611623e-07, + "loss": 1.5046, + "step": 1593 + }, + { + "epoch": 0.11106852942201163, + "grad_norm": 0.7735897379532881, + "learning_rate": 6.810162759572252e-07, + "loss": 1.4598, + "step": 1594 + }, + { + "epoch": 0.11113820854962896, + "grad_norm": 0.7141646454049914, + "learning_rate": 6.809926920675806e-07, + "loss": 1.4788, + "step": 1595 + }, + { + "epoch": 0.11120788767724628, + "grad_norm": 0.7004030395772681, + "learning_rate": 6.809690939933585e-07, + "loss": 1.5618, + "step": 1596 + }, + { + "epoch": 0.1112775668048636, + "grad_norm": 0.7195195106477383, + "learning_rate": 6.809454817356897e-07, + "loss": 1.5963, + "step": 1597 + }, + { + "epoch": 0.11134724593248092, + "grad_norm": 0.6591270162532643, + "learning_rate": 6.80921855295706e-07, + "loss": 1.4725, + "step": 1598 + }, + { + "epoch": 0.11141692506009825, + "grad_norm": 0.7034075202200033, + "learning_rate": 6.808982146745393e-07, + "loss": 1.5809, + "step": 1599 + }, + { + "epoch": 0.11148660418771557, + "grad_norm": 0.677382514036192, + "learning_rate": 6.808745598733229e-07, + "loss": 1.4277, + "step": 1600 + }, + { + "epoch": 0.11155628331533289, + "grad_norm": 0.6988131968542942, + "learning_rate": 6.8085089089319e-07, + "loss": 1.4795, + "step": 1601 + }, + { + "epoch": 0.11162596244295021, + "grad_norm": 0.6558486580469053, + "learning_rate": 6.808272077352751e-07, + "loss": 1.5151, + "step": 1602 + }, + { + "epoch": 0.11169564157056754, + "grad_norm": 0.7466317982625242, + "learning_rate": 6.808035104007131e-07, + "loss": 1.5656, + "step": 1603 + }, + { + "epoch": 0.11176532069818486, + "grad_norm": 0.7023736399262505, + "learning_rate": 6.807797988906397e-07, + "loss": 1.5659, + "step": 1604 + }, + { + "epoch": 0.11183499982580218, + "grad_norm": 0.6949112857794784, + "learning_rate": 6.807560732061909e-07, + "loss": 1.6362, + "step": 1605 + }, + { + "epoch": 0.1119046789534195, + "grad_norm": 0.7026342000850446, + "learning_rate": 6.807323333485041e-07, + "loss": 1.4273, + "step": 1606 + }, + { + "epoch": 0.11197435808103683, + "grad_norm": 0.6892676901830094, + "learning_rate": 6.807085793187167e-07, + "loss": 1.6011, + "step": 1607 + }, + { + "epoch": 0.11204403720865415, + "grad_norm": 0.7217360369844035, + "learning_rate": 6.80684811117967e-07, + "loss": 1.5082, + "step": 1608 + }, + { + "epoch": 0.11211371633627147, + "grad_norm": 0.7305733477997847, + "learning_rate": 6.806610287473942e-07, + "loss": 1.4838, + "step": 1609 + }, + { + "epoch": 0.1121833954638888, + "grad_norm": 0.7356577124965377, + "learning_rate": 6.806372322081379e-07, + "loss": 1.5469, + "step": 1610 + }, + { + "epoch": 0.11225307459150612, + "grad_norm": 0.678810220355509, + "learning_rate": 6.806134215013386e-07, + "loss": 1.4224, + "step": 1611 + }, + { + "epoch": 0.11232275371912344, + "grad_norm": 0.7015202486376189, + "learning_rate": 6.80589596628137e-07, + "loss": 1.5777, + "step": 1612 + }, + { + "epoch": 0.11239243284674076, + "grad_norm": 0.7235445572079848, + "learning_rate": 6.805657575896753e-07, + "loss": 1.5127, + "step": 1613 + }, + { + "epoch": 0.11246211197435808, + "grad_norm": 0.7302277793966115, + "learning_rate": 6.805419043870957e-07, + "loss": 1.5286, + "step": 1614 + }, + { + "epoch": 0.1125317911019754, + "grad_norm": 0.7486458593950519, + "learning_rate": 6.805180370215413e-07, + "loss": 1.6699, + "step": 1615 + }, + { + "epoch": 0.11260147022959273, + "grad_norm": 0.7586164782838338, + "learning_rate": 6.804941554941558e-07, + "loss": 1.5705, + "step": 1616 + }, + { + "epoch": 0.11267114935721005, + "grad_norm": 0.7399086186510679, + "learning_rate": 6.804702598060838e-07, + "loss": 1.6787, + "step": 1617 + }, + { + "epoch": 0.11274082848482737, + "grad_norm": 0.6983175095680024, + "learning_rate": 6.804463499584704e-07, + "loss": 1.7091, + "step": 1618 + }, + { + "epoch": 0.1128105076124447, + "grad_norm": 0.6907069923114252, + "learning_rate": 6.804224259524613e-07, + "loss": 1.6168, + "step": 1619 + }, + { + "epoch": 0.11288018674006202, + "grad_norm": 0.7384786246263818, + "learning_rate": 6.803984877892031e-07, + "loss": 1.516, + "step": 1620 + }, + { + "epoch": 0.11294986586767934, + "grad_norm": 0.7134317876278762, + "learning_rate": 6.80374535469843e-07, + "loss": 1.4856, + "step": 1621 + }, + { + "epoch": 0.11301954499529666, + "grad_norm": 0.7270747064059724, + "learning_rate": 6.803505689955286e-07, + "loss": 1.5908, + "step": 1622 + }, + { + "epoch": 0.11308922412291399, + "grad_norm": 0.7517895848474664, + "learning_rate": 6.803265883674087e-07, + "loss": 1.5437, + "step": 1623 + }, + { + "epoch": 0.11315890325053131, + "grad_norm": 0.6932504297112048, + "learning_rate": 6.803025935866324e-07, + "loss": 1.5541, + "step": 1624 + }, + { + "epoch": 0.11322858237814863, + "grad_norm": 0.7251577977979223, + "learning_rate": 6.802785846543495e-07, + "loss": 1.566, + "step": 1625 + }, + { + "epoch": 0.11329826150576595, + "grad_norm": 0.7127719443354646, + "learning_rate": 6.802545615717106e-07, + "loss": 1.6241, + "step": 1626 + }, + { + "epoch": 0.11336794063338328, + "grad_norm": 0.7287839552408284, + "learning_rate": 6.80230524339867e-07, + "loss": 1.6895, + "step": 1627 + }, + { + "epoch": 0.11343761976100059, + "grad_norm": 0.7003344394380433, + "learning_rate": 6.802064729599706e-07, + "loss": 1.5245, + "step": 1628 + }, + { + "epoch": 0.11350729888861791, + "grad_norm": 0.7174216722950388, + "learning_rate": 6.80182407433174e-07, + "loss": 1.6864, + "step": 1629 + }, + { + "epoch": 0.11357697801623523, + "grad_norm": 0.6850433524557609, + "learning_rate": 6.801583277606304e-07, + "loss": 1.5079, + "step": 1630 + }, + { + "epoch": 0.11364665714385255, + "grad_norm": 0.6871563613458431, + "learning_rate": 6.801342339434937e-07, + "loss": 1.4594, + "step": 1631 + }, + { + "epoch": 0.11371633627146988, + "grad_norm": 0.7170332340740067, + "learning_rate": 6.801101259829188e-07, + "loss": 1.656, + "step": 1632 + }, + { + "epoch": 0.1137860153990872, + "grad_norm": 0.7164452063926988, + "learning_rate": 6.800860038800607e-07, + "loss": 1.5006, + "step": 1633 + }, + { + "epoch": 0.11385569452670452, + "grad_norm": 0.6937900817795392, + "learning_rate": 6.800618676360755e-07, + "loss": 1.4742, + "step": 1634 + }, + { + "epoch": 0.11392537365432184, + "grad_norm": 0.7023556028757573, + "learning_rate": 6.800377172521199e-07, + "loss": 1.6291, + "step": 1635 + }, + { + "epoch": 0.11399505278193917, + "grad_norm": 0.7169038482009281, + "learning_rate": 6.800135527293511e-07, + "loss": 1.5947, + "step": 1636 + }, + { + "epoch": 0.11406473190955649, + "grad_norm": 0.7442205747389822, + "learning_rate": 6.799893740689272e-07, + "loss": 1.5385, + "step": 1637 + }, + { + "epoch": 0.11413441103717381, + "grad_norm": 0.6974269350674046, + "learning_rate": 6.79965181272007e-07, + "loss": 1.44, + "step": 1638 + }, + { + "epoch": 0.11420409016479113, + "grad_norm": 0.7202760860472499, + "learning_rate": 6.799409743397497e-07, + "loss": 1.5094, + "step": 1639 + }, + { + "epoch": 0.11427376929240846, + "grad_norm": 0.7228977897934166, + "learning_rate": 6.799167532733153e-07, + "loss": 1.7074, + "step": 1640 + }, + { + "epoch": 0.11434344842002578, + "grad_norm": 0.7422510209307608, + "learning_rate": 6.798925180738649e-07, + "loss": 1.4843, + "step": 1641 + }, + { + "epoch": 0.1144131275476431, + "grad_norm": 0.7648101870166034, + "learning_rate": 6.798682687425594e-07, + "loss": 1.6436, + "step": 1642 + }, + { + "epoch": 0.11448280667526042, + "grad_norm": 0.7794274991636997, + "learning_rate": 6.798440052805611e-07, + "loss": 1.6088, + "step": 1643 + }, + { + "epoch": 0.11455248580287775, + "grad_norm": 0.7468761912839483, + "learning_rate": 6.79819727689033e-07, + "loss": 1.6533, + "step": 1644 + }, + { + "epoch": 0.11462216493049507, + "grad_norm": 0.674230679158213, + "learning_rate": 6.79795435969138e-07, + "loss": 1.5058, + "step": 1645 + }, + { + "epoch": 0.11469184405811239, + "grad_norm": 0.7430350345094352, + "learning_rate": 6.797711301220406e-07, + "loss": 1.545, + "step": 1646 + }, + { + "epoch": 0.11476152318572971, + "grad_norm": 0.7276113930617449, + "learning_rate": 6.797468101489056e-07, + "loss": 1.5441, + "step": 1647 + }, + { + "epoch": 0.11483120231334704, + "grad_norm": 0.7398915067049099, + "learning_rate": 6.797224760508984e-07, + "loss": 1.6512, + "step": 1648 + }, + { + "epoch": 0.11490088144096436, + "grad_norm": 0.68940533642839, + "learning_rate": 6.796981278291849e-07, + "loss": 1.6151, + "step": 1649 + }, + { + "epoch": 0.11497056056858168, + "grad_norm": 0.7555918014240699, + "learning_rate": 6.796737654849322e-07, + "loss": 1.5668, + "step": 1650 + }, + { + "epoch": 0.115040239696199, + "grad_norm": 0.7099296673799076, + "learning_rate": 6.796493890193077e-07, + "loss": 1.4325, + "step": 1651 + }, + { + "epoch": 0.11510991882381633, + "grad_norm": 0.7595492751199421, + "learning_rate": 6.796249984334797e-07, + "loss": 1.6228, + "step": 1652 + }, + { + "epoch": 0.11517959795143365, + "grad_norm": 0.7632915208578668, + "learning_rate": 6.796005937286167e-07, + "loss": 1.5813, + "step": 1653 + }, + { + "epoch": 0.11524927707905097, + "grad_norm": 0.6922449395291023, + "learning_rate": 6.795761749058885e-07, + "loss": 1.5088, + "step": 1654 + }, + { + "epoch": 0.1153189562066683, + "grad_norm": 0.7099657991630075, + "learning_rate": 6.795517419664653e-07, + "loss": 1.5136, + "step": 1655 + }, + { + "epoch": 0.11538863533428562, + "grad_norm": 0.7613853045497073, + "learning_rate": 6.795272949115179e-07, + "loss": 1.538, + "step": 1656 + }, + { + "epoch": 0.11545831446190294, + "grad_norm": 0.7195462148384297, + "learning_rate": 6.795028337422179e-07, + "loss": 1.6404, + "step": 1657 + }, + { + "epoch": 0.11552799358952026, + "grad_norm": 0.6747873760990837, + "learning_rate": 6.794783584597375e-07, + "loss": 1.5352, + "step": 1658 + }, + { + "epoch": 0.11559767271713758, + "grad_norm": 0.6519250279592536, + "learning_rate": 6.794538690652497e-07, + "loss": 1.5127, + "step": 1659 + }, + { + "epoch": 0.1156673518447549, + "grad_norm": 0.7162973540123703, + "learning_rate": 6.794293655599279e-07, + "loss": 1.4985, + "step": 1660 + }, + { + "epoch": 0.11573703097237223, + "grad_norm": 0.7358283562766068, + "learning_rate": 6.794048479449463e-07, + "loss": 1.4902, + "step": 1661 + }, + { + "epoch": 0.11580671009998955, + "grad_norm": 0.718594579591006, + "learning_rate": 6.793803162214801e-07, + "loss": 1.5252, + "step": 1662 + }, + { + "epoch": 0.11587638922760687, + "grad_norm": 0.6804222879849213, + "learning_rate": 6.793557703907049e-07, + "loss": 1.4665, + "step": 1663 + }, + { + "epoch": 0.1159460683552242, + "grad_norm": 0.6597234535748099, + "learning_rate": 6.793312104537968e-07, + "loss": 1.5345, + "step": 1664 + }, + { + "epoch": 0.11601574748284152, + "grad_norm": 0.6858789541075233, + "learning_rate": 6.793066364119327e-07, + "loss": 1.3725, + "step": 1665 + }, + { + "epoch": 0.11608542661045884, + "grad_norm": 0.7023154283430406, + "learning_rate": 6.792820482662906e-07, + "loss": 1.6363, + "step": 1666 + }, + { + "epoch": 0.11615510573807616, + "grad_norm": 0.6987195535137968, + "learning_rate": 6.792574460180486e-07, + "loss": 1.5657, + "step": 1667 + }, + { + "epoch": 0.11622478486569349, + "grad_norm": 0.6864641357349911, + "learning_rate": 6.792328296683856e-07, + "loss": 1.4052, + "step": 1668 + }, + { + "epoch": 0.11629446399331081, + "grad_norm": 0.7222713490050489, + "learning_rate": 6.792081992184813e-07, + "loss": 1.5152, + "step": 1669 + }, + { + "epoch": 0.11636414312092813, + "grad_norm": 0.7287387894977275, + "learning_rate": 6.791835546695162e-07, + "loss": 1.5372, + "step": 1670 + }, + { + "epoch": 0.11643382224854545, + "grad_norm": 0.7969883890704471, + "learning_rate": 6.791588960226712e-07, + "loss": 1.4504, + "step": 1671 + }, + { + "epoch": 0.11650350137616278, + "grad_norm": 0.6806299876221195, + "learning_rate": 6.79134223279128e-07, + "loss": 1.4165, + "step": 1672 + }, + { + "epoch": 0.1165731805037801, + "grad_norm": 0.7626172771312602, + "learning_rate": 6.791095364400689e-07, + "loss": 1.6344, + "step": 1673 + }, + { + "epoch": 0.11664285963139741, + "grad_norm": 0.7140725948962551, + "learning_rate": 6.790848355066771e-07, + "loss": 1.5458, + "step": 1674 + }, + { + "epoch": 0.11671253875901473, + "grad_norm": 0.7015644476735204, + "learning_rate": 6.790601204801361e-07, + "loss": 1.5793, + "step": 1675 + }, + { + "epoch": 0.11678221788663205, + "grad_norm": 0.7850028189623065, + "learning_rate": 6.790353913616307e-07, + "loss": 1.503, + "step": 1676 + }, + { + "epoch": 0.11685189701424938, + "grad_norm": 0.7580222452334741, + "learning_rate": 6.790106481523455e-07, + "loss": 1.5288, + "step": 1677 + }, + { + "epoch": 0.1169215761418667, + "grad_norm": 0.8206009490254131, + "learning_rate": 6.789858908534665e-07, + "loss": 1.5293, + "step": 1678 + }, + { + "epoch": 0.11699125526948402, + "grad_norm": 0.7647388384602489, + "learning_rate": 6.789611194661801e-07, + "loss": 1.5605, + "step": 1679 + }, + { + "epoch": 0.11706093439710134, + "grad_norm": 0.7159767052729963, + "learning_rate": 6.789363339916733e-07, + "loss": 1.6455, + "step": 1680 + }, + { + "epoch": 0.11713061352471867, + "grad_norm": 0.77035475571459, + "learning_rate": 6.78911534431134e-07, + "loss": 1.6127, + "step": 1681 + }, + { + "epoch": 0.11720029265233599, + "grad_norm": 0.6771402586909827, + "learning_rate": 6.788867207857505e-07, + "loss": 1.4831, + "step": 1682 + }, + { + "epoch": 0.11726997177995331, + "grad_norm": 0.7260329904784459, + "learning_rate": 6.78861893056712e-07, + "loss": 1.5792, + "step": 1683 + }, + { + "epoch": 0.11733965090757063, + "grad_norm": 0.6889949040174258, + "learning_rate": 6.788370512452083e-07, + "loss": 1.5853, + "step": 1684 + }, + { + "epoch": 0.11740933003518796, + "grad_norm": 0.7097739042353229, + "learning_rate": 6.7881219535243e-07, + "loss": 1.6495, + "step": 1685 + }, + { + "epoch": 0.11747900916280528, + "grad_norm": 0.76869404066403, + "learning_rate": 6.78787325379568e-07, + "loss": 1.5965, + "step": 1686 + }, + { + "epoch": 0.1175486882904226, + "grad_norm": 0.7519933050837854, + "learning_rate": 6.787624413278143e-07, + "loss": 1.536, + "step": 1687 + }, + { + "epoch": 0.11761836741803992, + "grad_norm": 0.6665676367181234, + "learning_rate": 6.787375431983613e-07, + "loss": 1.4836, + "step": 1688 + }, + { + "epoch": 0.11768804654565725, + "grad_norm": 0.6976889194202481, + "learning_rate": 6.787126309924023e-07, + "loss": 1.5132, + "step": 1689 + }, + { + "epoch": 0.11775772567327457, + "grad_norm": 0.6997377024825535, + "learning_rate": 6.786877047111309e-07, + "loss": 1.6681, + "step": 1690 + }, + { + "epoch": 0.11782740480089189, + "grad_norm": 0.7204082985724929, + "learning_rate": 6.786627643557416e-07, + "loss": 1.5786, + "step": 1691 + }, + { + "epoch": 0.11789708392850921, + "grad_norm": 0.6949032262013504, + "learning_rate": 6.7863780992743e-07, + "loss": 1.5075, + "step": 1692 + }, + { + "epoch": 0.11796676305612654, + "grad_norm": 0.7113507414950234, + "learning_rate": 6.786128414273917e-07, + "loss": 1.5558, + "step": 1693 + }, + { + "epoch": 0.11803644218374386, + "grad_norm": 0.7369337445932507, + "learning_rate": 6.785878588568232e-07, + "loss": 1.7454, + "step": 1694 + }, + { + "epoch": 0.11810612131136118, + "grad_norm": 0.7139653769172889, + "learning_rate": 6.785628622169219e-07, + "loss": 1.5648, + "step": 1695 + }, + { + "epoch": 0.1181758004389785, + "grad_norm": 0.7636368697269599, + "learning_rate": 6.785378515088854e-07, + "loss": 1.6432, + "step": 1696 + }, + { + "epoch": 0.11824547956659583, + "grad_norm": 0.6979859259064816, + "learning_rate": 6.785128267339125e-07, + "loss": 1.6527, + "step": 1697 + }, + { + "epoch": 0.11831515869421315, + "grad_norm": 0.7449511856669205, + "learning_rate": 6.784877878932024e-07, + "loss": 1.5515, + "step": 1698 + }, + { + "epoch": 0.11838483782183047, + "grad_norm": 0.7035936657977012, + "learning_rate": 6.784627349879551e-07, + "loss": 1.476, + "step": 1699 + }, + { + "epoch": 0.1184545169494478, + "grad_norm": 0.7051512194691226, + "learning_rate": 6.784376680193709e-07, + "loss": 1.5173, + "step": 1700 + }, + { + "epoch": 0.11852419607706512, + "grad_norm": 0.6909050176678639, + "learning_rate": 6.784125869886512e-07, + "loss": 1.5744, + "step": 1701 + }, + { + "epoch": 0.11859387520468244, + "grad_norm": 0.7165979408686158, + "learning_rate": 6.78387491896998e-07, + "loss": 1.4485, + "step": 1702 + }, + { + "epoch": 0.11866355433229976, + "grad_norm": 0.7430454803926803, + "learning_rate": 6.783623827456139e-07, + "loss": 1.601, + "step": 1703 + }, + { + "epoch": 0.11873323345991708, + "grad_norm": 0.7117476169672308, + "learning_rate": 6.783372595357023e-07, + "loss": 1.3338, + "step": 1704 + }, + { + "epoch": 0.1188029125875344, + "grad_norm": 0.7970341950051524, + "learning_rate": 6.783121222684668e-07, + "loss": 1.5163, + "step": 1705 + }, + { + "epoch": 0.11887259171515173, + "grad_norm": 0.7299219462336711, + "learning_rate": 6.782869709451125e-07, + "loss": 1.527, + "step": 1706 + }, + { + "epoch": 0.11894227084276905, + "grad_norm": 0.7140113860452822, + "learning_rate": 6.782618055668442e-07, + "loss": 1.5946, + "step": 1707 + }, + { + "epoch": 0.11901194997038637, + "grad_norm": 0.8280777358865281, + "learning_rate": 6.782366261348682e-07, + "loss": 1.5883, + "step": 1708 + }, + { + "epoch": 0.1190816290980037, + "grad_norm": 0.761620219181658, + "learning_rate": 6.782114326503911e-07, + "loss": 1.5823, + "step": 1709 + }, + { + "epoch": 0.11915130822562102, + "grad_norm": 0.8154486412727795, + "learning_rate": 6.781862251146201e-07, + "loss": 1.639, + "step": 1710 + }, + { + "epoch": 0.11922098735323834, + "grad_norm": 0.729402725558841, + "learning_rate": 6.781610035287634e-07, + "loss": 1.4666, + "step": 1711 + }, + { + "epoch": 0.11929066648085566, + "grad_norm": 0.7089244714070104, + "learning_rate": 6.781357678940296e-07, + "loss": 1.612, + "step": 1712 + }, + { + "epoch": 0.11936034560847299, + "grad_norm": 0.663561610229027, + "learning_rate": 6.781105182116277e-07, + "loss": 1.4386, + "step": 1713 + }, + { + "epoch": 0.11943002473609031, + "grad_norm": 0.7533233166790155, + "learning_rate": 6.780852544827683e-07, + "loss": 1.543, + "step": 1714 + }, + { + "epoch": 0.11949970386370763, + "grad_norm": 0.6484805021471591, + "learning_rate": 6.780599767086617e-07, + "loss": 1.4806, + "step": 1715 + }, + { + "epoch": 0.11956938299132495, + "grad_norm": 0.7670450412169437, + "learning_rate": 6.780346848905196e-07, + "loss": 1.569, + "step": 1716 + }, + { + "epoch": 0.11963906211894228, + "grad_norm": 0.6593231059169545, + "learning_rate": 6.780093790295537e-07, + "loss": 1.4938, + "step": 1717 + }, + { + "epoch": 0.1197087412465596, + "grad_norm": 0.7301933487162761, + "learning_rate": 6.779840591269766e-07, + "loss": 1.5712, + "step": 1718 + }, + { + "epoch": 0.11977842037417692, + "grad_norm": 0.703158568302166, + "learning_rate": 6.779587251840021e-07, + "loss": 1.6006, + "step": 1719 + }, + { + "epoch": 0.11984809950179423, + "grad_norm": 0.7672500156829497, + "learning_rate": 6.779333772018441e-07, + "loss": 1.5412, + "step": 1720 + }, + { + "epoch": 0.11991777862941155, + "grad_norm": 0.7695414524452814, + "learning_rate": 6.779080151817172e-07, + "loss": 1.6895, + "step": 1721 + }, + { + "epoch": 0.11998745775702888, + "grad_norm": 0.677453250708521, + "learning_rate": 6.778826391248369e-07, + "loss": 1.5178, + "step": 1722 + }, + { + "epoch": 0.1200571368846462, + "grad_norm": 0.707782732326732, + "learning_rate": 6.778572490324192e-07, + "loss": 1.5283, + "step": 1723 + }, + { + "epoch": 0.12012681601226352, + "grad_norm": 0.6959585255791797, + "learning_rate": 6.778318449056811e-07, + "loss": 1.4923, + "step": 1724 + }, + { + "epoch": 0.12019649513988084, + "grad_norm": 0.7472007496068623, + "learning_rate": 6.778064267458396e-07, + "loss": 1.5479, + "step": 1725 + }, + { + "epoch": 0.12026617426749817, + "grad_norm": 0.7676470925470961, + "learning_rate": 6.77780994554113e-07, + "loss": 1.62, + "step": 1726 + }, + { + "epoch": 0.12033585339511549, + "grad_norm": 0.6976928226879325, + "learning_rate": 6.777555483317201e-07, + "loss": 1.5113, + "step": 1727 + }, + { + "epoch": 0.12040553252273281, + "grad_norm": 0.748334145125312, + "learning_rate": 6.777300880798806e-07, + "loss": 1.539, + "step": 1728 + }, + { + "epoch": 0.12047521165035013, + "grad_norm": 0.677784509849018, + "learning_rate": 6.777046137998139e-07, + "loss": 1.4158, + "step": 1729 + }, + { + "epoch": 0.12054489077796746, + "grad_norm": 0.6416320964645821, + "learning_rate": 6.776791254927415e-07, + "loss": 1.4458, + "step": 1730 + }, + { + "epoch": 0.12061456990558478, + "grad_norm": 0.7232355551723145, + "learning_rate": 6.776536231598843e-07, + "loss": 1.5007, + "step": 1731 + }, + { + "epoch": 0.1206842490332021, + "grad_norm": 0.6940114252253201, + "learning_rate": 6.776281068024648e-07, + "loss": 1.5597, + "step": 1732 + }, + { + "epoch": 0.12075392816081942, + "grad_norm": 0.766955235541318, + "learning_rate": 6.776025764217057e-07, + "loss": 1.6602, + "step": 1733 + }, + { + "epoch": 0.12082360728843675, + "grad_norm": 0.6672225734828413, + "learning_rate": 6.775770320188304e-07, + "loss": 1.5247, + "step": 1734 + }, + { + "epoch": 0.12089328641605407, + "grad_norm": 0.7381716051530299, + "learning_rate": 6.77551473595063e-07, + "loss": 1.5521, + "step": 1735 + }, + { + "epoch": 0.12096296554367139, + "grad_norm": 0.6818041083316355, + "learning_rate": 6.775259011516285e-07, + "loss": 1.5988, + "step": 1736 + }, + { + "epoch": 0.12103264467128871, + "grad_norm": 0.7532425489494298, + "learning_rate": 6.775003146897523e-07, + "loss": 1.5768, + "step": 1737 + }, + { + "epoch": 0.12110232379890604, + "grad_norm": 0.7004792129217193, + "learning_rate": 6.774747142106604e-07, + "loss": 1.4175, + "step": 1738 + }, + { + "epoch": 0.12117200292652336, + "grad_norm": 0.6979999270979104, + "learning_rate": 6.774490997155799e-07, + "loss": 1.5503, + "step": 1739 + }, + { + "epoch": 0.12124168205414068, + "grad_norm": 0.7309248475402028, + "learning_rate": 6.774234712057381e-07, + "loss": 1.5828, + "step": 1740 + }, + { + "epoch": 0.121311361181758, + "grad_norm": 0.7698670180342181, + "learning_rate": 6.773978286823632e-07, + "loss": 1.6536, + "step": 1741 + }, + { + "epoch": 0.12138104030937533, + "grad_norm": 0.751428111254726, + "learning_rate": 6.773721721466841e-07, + "loss": 1.7044, + "step": 1742 + }, + { + "epoch": 0.12145071943699265, + "grad_norm": 0.6960299296307737, + "learning_rate": 6.773465015999302e-07, + "loss": 1.5352, + "step": 1743 + }, + { + "epoch": 0.12152039856460997, + "grad_norm": 0.7043981294939846, + "learning_rate": 6.773208170433319e-07, + "loss": 1.5553, + "step": 1744 + }, + { + "epoch": 0.1215900776922273, + "grad_norm": 0.7442626020263271, + "learning_rate": 6.772951184781199e-07, + "loss": 1.4759, + "step": 1745 + }, + { + "epoch": 0.12165975681984462, + "grad_norm": 0.7031340655574903, + "learning_rate": 6.772694059055255e-07, + "loss": 1.5555, + "step": 1746 + }, + { + "epoch": 0.12172943594746194, + "grad_norm": 0.7117663767653726, + "learning_rate": 6.772436793267814e-07, + "loss": 1.5111, + "step": 1747 + }, + { + "epoch": 0.12179911507507926, + "grad_norm": 0.6919303229240988, + "learning_rate": 6.772179387431202e-07, + "loss": 1.5557, + "step": 1748 + }, + { + "epoch": 0.12186879420269658, + "grad_norm": 0.7151382209745435, + "learning_rate": 6.771921841557755e-07, + "loss": 1.5395, + "step": 1749 + }, + { + "epoch": 0.1219384733303139, + "grad_norm": 0.7159206817537367, + "learning_rate": 6.771664155659814e-07, + "loss": 1.5215, + "step": 1750 + }, + { + "epoch": 0.12200815245793123, + "grad_norm": 0.7647443834562503, + "learning_rate": 6.771406329749728e-07, + "loss": 1.5614, + "step": 1751 + }, + { + "epoch": 0.12207783158554855, + "grad_norm": 0.7242339466839115, + "learning_rate": 6.771148363839854e-07, + "loss": 1.4939, + "step": 1752 + }, + { + "epoch": 0.12214751071316587, + "grad_norm": 0.6616100019536842, + "learning_rate": 6.770890257942553e-07, + "loss": 1.461, + "step": 1753 + }, + { + "epoch": 0.1222171898407832, + "grad_norm": 0.688560043683067, + "learning_rate": 6.770632012070195e-07, + "loss": 1.5556, + "step": 1754 + }, + { + "epoch": 0.12228686896840052, + "grad_norm": 0.7160076848018615, + "learning_rate": 6.770373626235155e-07, + "loss": 1.5213, + "step": 1755 + }, + { + "epoch": 0.12235654809601784, + "grad_norm": 0.7547550213528791, + "learning_rate": 6.770115100449814e-07, + "loss": 1.6089, + "step": 1756 + }, + { + "epoch": 0.12242622722363516, + "grad_norm": 0.6926471414125789, + "learning_rate": 6.769856434726564e-07, + "loss": 1.6533, + "step": 1757 + }, + { + "epoch": 0.12249590635125249, + "grad_norm": 0.8418470820434377, + "learning_rate": 6.769597629077799e-07, + "loss": 1.616, + "step": 1758 + }, + { + "epoch": 0.12256558547886981, + "grad_norm": 0.7122046268206644, + "learning_rate": 6.769338683515921e-07, + "loss": 1.4934, + "step": 1759 + }, + { + "epoch": 0.12263526460648713, + "grad_norm": 0.6997019866605317, + "learning_rate": 6.76907959805334e-07, + "loss": 1.5801, + "step": 1760 + }, + { + "epoch": 0.12270494373410445, + "grad_norm": 0.7229950358350811, + "learning_rate": 6.768820372702473e-07, + "loss": 1.5317, + "step": 1761 + }, + { + "epoch": 0.12277462286172178, + "grad_norm": 0.6836420011968071, + "learning_rate": 6.768561007475743e-07, + "loss": 1.5504, + "step": 1762 + }, + { + "epoch": 0.1228443019893391, + "grad_norm": 0.729122942892889, + "learning_rate": 6.768301502385575e-07, + "loss": 1.544, + "step": 1763 + }, + { + "epoch": 0.12291398111695642, + "grad_norm": 0.7293140061308643, + "learning_rate": 6.768041857444408e-07, + "loss": 1.524, + "step": 1764 + }, + { + "epoch": 0.12298366024457374, + "grad_norm": 0.7587027647685078, + "learning_rate": 6.767782072664686e-07, + "loss": 1.6564, + "step": 1765 + }, + { + "epoch": 0.12305333937219105, + "grad_norm": 0.9828600493808015, + "learning_rate": 6.767522148058857e-07, + "loss": 1.7059, + "step": 1766 + }, + { + "epoch": 0.12312301849980838, + "grad_norm": 0.7534924840916932, + "learning_rate": 6.767262083639376e-07, + "loss": 1.5926, + "step": 1767 + }, + { + "epoch": 0.1231926976274257, + "grad_norm": 0.6925293310330989, + "learning_rate": 6.767001879418707e-07, + "loss": 1.6045, + "step": 1768 + }, + { + "epoch": 0.12326237675504302, + "grad_norm": 0.6414623823691792, + "learning_rate": 6.76674153540932e-07, + "loss": 1.458, + "step": 1769 + }, + { + "epoch": 0.12333205588266034, + "grad_norm": 0.6865414673148647, + "learning_rate": 6.766481051623689e-07, + "loss": 1.6308, + "step": 1770 + }, + { + "epoch": 0.12340173501027767, + "grad_norm": 0.7751892288650261, + "learning_rate": 6.766220428074302e-07, + "loss": 1.5195, + "step": 1771 + }, + { + "epoch": 0.12347141413789499, + "grad_norm": 0.757617472740173, + "learning_rate": 6.765959664773643e-07, + "loss": 1.5832, + "step": 1772 + }, + { + "epoch": 0.12354109326551231, + "grad_norm": 0.6845490688631454, + "learning_rate": 6.76569876173421e-07, + "loss": 1.4396, + "step": 1773 + }, + { + "epoch": 0.12361077239312963, + "grad_norm": 0.704065341495557, + "learning_rate": 6.765437718968508e-07, + "loss": 1.5786, + "step": 1774 + }, + { + "epoch": 0.12368045152074696, + "grad_norm": 0.7294914988615427, + "learning_rate": 6.765176536489044e-07, + "loss": 1.573, + "step": 1775 + }, + { + "epoch": 0.12375013064836428, + "grad_norm": 0.7297804629562296, + "learning_rate": 6.764915214308337e-07, + "loss": 1.4654, + "step": 1776 + }, + { + "epoch": 0.1238198097759816, + "grad_norm": 0.7017524980244302, + "learning_rate": 6.764653752438906e-07, + "loss": 1.6069, + "step": 1777 + }, + { + "epoch": 0.12388948890359892, + "grad_norm": 0.6905428204511646, + "learning_rate": 6.764392150893287e-07, + "loss": 1.5557, + "step": 1778 + }, + { + "epoch": 0.12395916803121625, + "grad_norm": 0.6926181291094297, + "learning_rate": 6.764130409684011e-07, + "loss": 1.6095, + "step": 1779 + }, + { + "epoch": 0.12402884715883357, + "grad_norm": 0.6707933571259431, + "learning_rate": 6.763868528823623e-07, + "loss": 1.4735, + "step": 1780 + }, + { + "epoch": 0.12409852628645089, + "grad_norm": 0.7107882072617481, + "learning_rate": 6.763606508324675e-07, + "loss": 1.5258, + "step": 1781 + }, + { + "epoch": 0.12416820541406821, + "grad_norm": 0.6739566888923207, + "learning_rate": 6.76334434819972e-07, + "loss": 1.4255, + "step": 1782 + }, + { + "epoch": 0.12423788454168554, + "grad_norm": 0.710791285317531, + "learning_rate": 6.763082048461322e-07, + "loss": 1.5633, + "step": 1783 + }, + { + "epoch": 0.12430756366930286, + "grad_norm": 0.7549679673837194, + "learning_rate": 6.762819609122052e-07, + "loss": 1.6048, + "step": 1784 + }, + { + "epoch": 0.12437724279692018, + "grad_norm": 0.6970338314567243, + "learning_rate": 6.762557030194489e-07, + "loss": 1.638, + "step": 1785 + }, + { + "epoch": 0.1244469219245375, + "grad_norm": 0.7329681304954031, + "learning_rate": 6.762294311691212e-07, + "loss": 1.6062, + "step": 1786 + }, + { + "epoch": 0.12451660105215483, + "grad_norm": 0.7518073218055812, + "learning_rate": 6.762031453624812e-07, + "loss": 1.5193, + "step": 1787 + }, + { + "epoch": 0.12458628017977215, + "grad_norm": 0.753125266754797, + "learning_rate": 6.761768456007888e-07, + "loss": 1.6375, + "step": 1788 + }, + { + "epoch": 0.12465595930738947, + "grad_norm": 0.7394329680594353, + "learning_rate": 6.76150531885304e-07, + "loss": 1.6436, + "step": 1789 + }, + { + "epoch": 0.1247256384350068, + "grad_norm": 0.6387955308763775, + "learning_rate": 6.761242042172882e-07, + "loss": 1.4788, + "step": 1790 + }, + { + "epoch": 0.12479531756262412, + "grad_norm": 0.7105153937432336, + "learning_rate": 6.760978625980027e-07, + "loss": 1.4578, + "step": 1791 + }, + { + "epoch": 0.12486499669024144, + "grad_norm": 0.7117812369399352, + "learning_rate": 6.760715070287101e-07, + "loss": 1.4701, + "step": 1792 + }, + { + "epoch": 0.12493467581785876, + "grad_norm": 0.6869069756247798, + "learning_rate": 6.760451375106733e-07, + "loss": 1.4755, + "step": 1793 + }, + { + "epoch": 0.12500435494547607, + "grad_norm": 0.6883046784262724, + "learning_rate": 6.76018754045156e-07, + "loss": 1.3836, + "step": 1794 + }, + { + "epoch": 0.1250740340730934, + "grad_norm": 0.7168411586962429, + "learning_rate": 6.759923566334225e-07, + "loss": 1.4577, + "step": 1795 + }, + { + "epoch": 0.12514371320071072, + "grad_norm": 0.753246753390711, + "learning_rate": 6.75965945276738e-07, + "loss": 1.4208, + "step": 1796 + }, + { + "epoch": 0.12521339232832804, + "grad_norm": 0.7194527123537333, + "learning_rate": 6.75939519976368e-07, + "loss": 1.5584, + "step": 1797 + }, + { + "epoch": 0.12528307145594536, + "grad_norm": 0.7148704507883208, + "learning_rate": 6.759130807335789e-07, + "loss": 1.5267, + "step": 1798 + }, + { + "epoch": 0.12535275058356268, + "grad_norm": 0.7706359160338018, + "learning_rate": 6.758866275496378e-07, + "loss": 1.622, + "step": 1799 + }, + { + "epoch": 0.12542242971118, + "grad_norm": 0.6968279687765971, + "learning_rate": 6.758601604258122e-07, + "loss": 1.5589, + "step": 1800 + }, + { + "epoch": 0.12549210883879733, + "grad_norm": 0.7075640465481938, + "learning_rate": 6.758336793633707e-07, + "loss": 1.5232, + "step": 1801 + }, + { + "epoch": 0.12556178796641465, + "grad_norm": 0.6986171636247963, + "learning_rate": 6.758071843635822e-07, + "loss": 1.5965, + "step": 1802 + }, + { + "epoch": 0.12563146709403197, + "grad_norm": 0.7080541368146622, + "learning_rate": 6.757806754277164e-07, + "loss": 1.6024, + "step": 1803 + }, + { + "epoch": 0.1257011462216493, + "grad_norm": 0.6954983639729273, + "learning_rate": 6.757541525570436e-07, + "loss": 1.4651, + "step": 1804 + }, + { + "epoch": 0.12577082534926662, + "grad_norm": 0.7513907877344518, + "learning_rate": 6.75727615752835e-07, + "loss": 1.6319, + "step": 1805 + }, + { + "epoch": 0.12584050447688394, + "grad_norm": 0.7377134006135804, + "learning_rate": 6.757010650163622e-07, + "loss": 1.4196, + "step": 1806 + }, + { + "epoch": 0.12591018360450126, + "grad_norm": 0.7379253766876244, + "learning_rate": 6.756745003488975e-07, + "loss": 1.6043, + "step": 1807 + }, + { + "epoch": 0.12597986273211859, + "grad_norm": 0.6780338111507962, + "learning_rate": 6.75647921751714e-07, + "loss": 1.5364, + "step": 1808 + }, + { + "epoch": 0.1260495418597359, + "grad_norm": 0.8363290539712473, + "learning_rate": 6.756213292260855e-07, + "loss": 1.4253, + "step": 1809 + }, + { + "epoch": 0.12611922098735323, + "grad_norm": 0.7171329233065877, + "learning_rate": 6.755947227732862e-07, + "loss": 1.6683, + "step": 1810 + }, + { + "epoch": 0.12618890011497055, + "grad_norm": 0.6956735267493139, + "learning_rate": 6.755681023945912e-07, + "loss": 1.5402, + "step": 1811 + }, + { + "epoch": 0.12625857924258788, + "grad_norm": 0.7265898886948043, + "learning_rate": 6.755414680912763e-07, + "loss": 1.5621, + "step": 1812 + }, + { + "epoch": 0.1263282583702052, + "grad_norm": 0.7832071974789533, + "learning_rate": 6.755148198646176e-07, + "loss": 1.6196, + "step": 1813 + }, + { + "epoch": 0.12639793749782252, + "grad_norm": 0.6980941771921907, + "learning_rate": 6.754881577158925e-07, + "loss": 1.469, + "step": 1814 + }, + { + "epoch": 0.12646761662543984, + "grad_norm": 0.7821845449425642, + "learning_rate": 6.754614816463783e-07, + "loss": 1.5984, + "step": 1815 + }, + { + "epoch": 0.12653729575305717, + "grad_norm": 0.7196655127524775, + "learning_rate": 6.754347916573539e-07, + "loss": 1.4687, + "step": 1816 + }, + { + "epoch": 0.1266069748806745, + "grad_norm": 0.7132709263239155, + "learning_rate": 6.754080877500978e-07, + "loss": 1.5156, + "step": 1817 + }, + { + "epoch": 0.1266766540082918, + "grad_norm": 0.7073112967452371, + "learning_rate": 6.7538136992589e-07, + "loss": 1.5509, + "step": 1818 + }, + { + "epoch": 0.12674633313590913, + "grad_norm": 0.7151991872285213, + "learning_rate": 6.753546381860108e-07, + "loss": 1.5542, + "step": 1819 + }, + { + "epoch": 0.12681601226352646, + "grad_norm": 0.7242367578277304, + "learning_rate": 6.753278925317413e-07, + "loss": 1.5261, + "step": 1820 + }, + { + "epoch": 0.12688569139114378, + "grad_norm": 0.6651665644113366, + "learning_rate": 6.753011329643631e-07, + "loss": 1.4353, + "step": 1821 + }, + { + "epoch": 0.1269553705187611, + "grad_norm": 0.7588221525761546, + "learning_rate": 6.752743594851586e-07, + "loss": 1.4917, + "step": 1822 + }, + { + "epoch": 0.12702504964637842, + "grad_norm": 0.7726293303712665, + "learning_rate": 6.75247572095411e-07, + "loss": 1.6404, + "step": 1823 + }, + { + "epoch": 0.12709472877399575, + "grad_norm": 0.7105867827958886, + "learning_rate": 6.752207707964037e-07, + "loss": 1.484, + "step": 1824 + }, + { + "epoch": 0.12716440790161307, + "grad_norm": 0.6953351793350567, + "learning_rate": 6.751939555894213e-07, + "loss": 1.531, + "step": 1825 + }, + { + "epoch": 0.1272340870292304, + "grad_norm": 0.6677024686574735, + "learning_rate": 6.75167126475749e-07, + "loss": 1.5333, + "step": 1826 + }, + { + "epoch": 0.12730376615684771, + "grad_norm": 0.7551124751250401, + "learning_rate": 6.751402834566721e-07, + "loss": 1.5653, + "step": 1827 + }, + { + "epoch": 0.12737344528446504, + "grad_norm": 0.7104248422758904, + "learning_rate": 6.751134265334772e-07, + "loss": 1.5667, + "step": 1828 + }, + { + "epoch": 0.12744312441208236, + "grad_norm": 0.6763604752203057, + "learning_rate": 6.750865557074514e-07, + "loss": 1.4499, + "step": 1829 + }, + { + "epoch": 0.12751280353969968, + "grad_norm": 0.6766323797495721, + "learning_rate": 6.750596709798822e-07, + "loss": 1.3995, + "step": 1830 + }, + { + "epoch": 0.127582482667317, + "grad_norm": 0.7736093226697133, + "learning_rate": 6.750327723520581e-07, + "loss": 1.5276, + "step": 1831 + }, + { + "epoch": 0.12765216179493433, + "grad_norm": 0.6719858842850682, + "learning_rate": 6.750058598252682e-07, + "loss": 1.4956, + "step": 1832 + }, + { + "epoch": 0.12772184092255165, + "grad_norm": 0.7346381580963557, + "learning_rate": 6.74978933400802e-07, + "loss": 1.5993, + "step": 1833 + }, + { + "epoch": 0.12779152005016897, + "grad_norm": 0.6808106123344357, + "learning_rate": 6.749519930799501e-07, + "loss": 1.5483, + "step": 1834 + }, + { + "epoch": 0.1278611991777863, + "grad_norm": 0.6727911376600302, + "learning_rate": 6.749250388640033e-07, + "loss": 1.5679, + "step": 1835 + }, + { + "epoch": 0.12793087830540362, + "grad_norm": 0.6908490917326462, + "learning_rate": 6.748980707542537e-07, + "loss": 1.4939, + "step": 1836 + }, + { + "epoch": 0.12800055743302094, + "grad_norm": 0.6991194309070923, + "learning_rate": 6.748710887519931e-07, + "loss": 1.5277, + "step": 1837 + }, + { + "epoch": 0.12807023656063826, + "grad_norm": 0.7535462584673673, + "learning_rate": 6.748440928585151e-07, + "loss": 1.5031, + "step": 1838 + }, + { + "epoch": 0.12813991568825558, + "grad_norm": 0.7257701505696328, + "learning_rate": 6.748170830751129e-07, + "loss": 1.6171, + "step": 1839 + }, + { + "epoch": 0.1282095948158729, + "grad_norm": 0.7197206514629374, + "learning_rate": 6.747900594030811e-07, + "loss": 1.5134, + "step": 1840 + }, + { + "epoch": 0.12827927394349023, + "grad_norm": 0.6863960162086155, + "learning_rate": 6.747630218437149e-07, + "loss": 1.6063, + "step": 1841 + }, + { + "epoch": 0.12834895307110755, + "grad_norm": 0.7451035417247105, + "learning_rate": 6.747359703983097e-07, + "loss": 1.5635, + "step": 1842 + }, + { + "epoch": 0.12841863219872487, + "grad_norm": 0.7056216450550601, + "learning_rate": 6.747089050681621e-07, + "loss": 1.4805, + "step": 1843 + }, + { + "epoch": 0.1284883113263422, + "grad_norm": 0.6961347423164467, + "learning_rate": 6.746818258545689e-07, + "loss": 1.4463, + "step": 1844 + }, + { + "epoch": 0.12855799045395952, + "grad_norm": 0.732709513808762, + "learning_rate": 6.746547327588279e-07, + "loss": 1.6429, + "step": 1845 + }, + { + "epoch": 0.12862766958157684, + "grad_norm": 0.7357288567627138, + "learning_rate": 6.746276257822375e-07, + "loss": 1.4988, + "step": 1846 + }, + { + "epoch": 0.12869734870919416, + "grad_norm": 0.6706891475934581, + "learning_rate": 6.746005049260967e-07, + "loss": 1.5309, + "step": 1847 + }, + { + "epoch": 0.1287670278368115, + "grad_norm": 0.7319525874852473, + "learning_rate": 6.745733701917052e-07, + "loss": 1.5278, + "step": 1848 + }, + { + "epoch": 0.1288367069644288, + "grad_norm": 0.6639104121057977, + "learning_rate": 6.745462215803632e-07, + "loss": 1.3677, + "step": 1849 + }, + { + "epoch": 0.12890638609204613, + "grad_norm": 0.7085480744877006, + "learning_rate": 6.745190590933719e-07, + "loss": 1.518, + "step": 1850 + }, + { + "epoch": 0.12897606521966345, + "grad_norm": 0.7505374113469124, + "learning_rate": 6.744918827320328e-07, + "loss": 1.5333, + "step": 1851 + }, + { + "epoch": 0.12904574434728078, + "grad_norm": 0.7352795678485466, + "learning_rate": 6.744646924976485e-07, + "loss": 1.5475, + "step": 1852 + }, + { + "epoch": 0.1291154234748981, + "grad_norm": 0.7035551100291901, + "learning_rate": 6.744374883915218e-07, + "loss": 1.5256, + "step": 1853 + }, + { + "epoch": 0.12918510260251542, + "grad_norm": 0.7153574641045704, + "learning_rate": 6.744102704149565e-07, + "loss": 1.5735, + "step": 1854 + }, + { + "epoch": 0.12925478173013275, + "grad_norm": 0.6577428673649858, + "learning_rate": 6.743830385692569e-07, + "loss": 1.4839, + "step": 1855 + }, + { + "epoch": 0.12932446085775007, + "grad_norm": 0.7310828390027627, + "learning_rate": 6.743557928557279e-07, + "loss": 1.5818, + "step": 1856 + }, + { + "epoch": 0.1293941399853674, + "grad_norm": 0.6714264380020178, + "learning_rate": 6.743285332756753e-07, + "loss": 1.4134, + "step": 1857 + }, + { + "epoch": 0.1294638191129847, + "grad_norm": 0.7398456859492963, + "learning_rate": 6.743012598304055e-07, + "loss": 1.5288, + "step": 1858 + }, + { + "epoch": 0.12953349824060204, + "grad_norm": 0.7655971284671793, + "learning_rate": 6.742739725212255e-07, + "loss": 1.6144, + "step": 1859 + }, + { + "epoch": 0.12960317736821936, + "grad_norm": 0.7045382170108533, + "learning_rate": 6.742466713494427e-07, + "loss": 1.5161, + "step": 1860 + }, + { + "epoch": 0.12967285649583668, + "grad_norm": 0.7037565360640923, + "learning_rate": 6.742193563163656e-07, + "loss": 1.5649, + "step": 1861 + }, + { + "epoch": 0.129742535623454, + "grad_norm": 0.6883733361202018, + "learning_rate": 6.741920274233033e-07, + "loss": 1.5637, + "step": 1862 + }, + { + "epoch": 0.12981221475107133, + "grad_norm": 0.7285700573399307, + "learning_rate": 6.741646846715651e-07, + "loss": 1.6719, + "step": 1863 + }, + { + "epoch": 0.12988189387868865, + "grad_norm": 0.7301447510701851, + "learning_rate": 6.741373280624618e-07, + "loss": 1.5617, + "step": 1864 + }, + { + "epoch": 0.12995157300630597, + "grad_norm": 0.7031543215374925, + "learning_rate": 6.741099575973041e-07, + "loss": 1.4602, + "step": 1865 + }, + { + "epoch": 0.1300212521339233, + "grad_norm": 0.7094167448128131, + "learning_rate": 6.740825732774036e-07, + "loss": 1.616, + "step": 1866 + }, + { + "epoch": 0.13009093126154062, + "grad_norm": 0.7651565895226398, + "learning_rate": 6.740551751040729e-07, + "loss": 1.5392, + "step": 1867 + }, + { + "epoch": 0.13016061038915794, + "grad_norm": 0.7735065918268992, + "learning_rate": 6.740277630786246e-07, + "loss": 1.5778, + "step": 1868 + }, + { + "epoch": 0.13023028951677526, + "grad_norm": 0.7967705631098297, + "learning_rate": 6.740003372023727e-07, + "loss": 1.7259, + "step": 1869 + }, + { + "epoch": 0.13029996864439258, + "grad_norm": 0.6792582135508917, + "learning_rate": 6.739728974766312e-07, + "loss": 1.6366, + "step": 1870 + }, + { + "epoch": 0.1303696477720099, + "grad_norm": 0.730703176577263, + "learning_rate": 6.739454439027153e-07, + "loss": 1.5665, + "step": 1871 + }, + { + "epoch": 0.13043932689962723, + "grad_norm": 0.6920045189349746, + "learning_rate": 6.739179764819405e-07, + "loss": 1.5654, + "step": 1872 + }, + { + "epoch": 0.13050900602724455, + "grad_norm": 0.7136204029101176, + "learning_rate": 6.738904952156231e-07, + "loss": 1.5337, + "step": 1873 + }, + { + "epoch": 0.13057868515486187, + "grad_norm": 0.6619804892238528, + "learning_rate": 6.738630001050801e-07, + "loss": 1.5429, + "step": 1874 + }, + { + "epoch": 0.1306483642824792, + "grad_norm": 0.7257253083228685, + "learning_rate": 6.738354911516292e-07, + "loss": 1.5279, + "step": 1875 + }, + { + "epoch": 0.13071804341009652, + "grad_norm": 0.7053782168665625, + "learning_rate": 6.738079683565885e-07, + "loss": 1.5864, + "step": 1876 + }, + { + "epoch": 0.13078772253771384, + "grad_norm": 0.763390428555883, + "learning_rate": 6.73780431721277e-07, + "loss": 1.5123, + "step": 1877 + }, + { + "epoch": 0.13085740166533116, + "grad_norm": 0.7091548271863509, + "learning_rate": 6.737528812470145e-07, + "loss": 1.4875, + "step": 1878 + }, + { + "epoch": 0.13092708079294849, + "grad_norm": 0.705405057687693, + "learning_rate": 6.737253169351209e-07, + "loss": 1.4578, + "step": 1879 + }, + { + "epoch": 0.13099675992056578, + "grad_norm": 0.6716338179893901, + "learning_rate": 6.736977387869176e-07, + "loss": 1.4275, + "step": 1880 + }, + { + "epoch": 0.1310664390481831, + "grad_norm": 0.6687999558803702, + "learning_rate": 6.736701468037259e-07, + "loss": 1.4048, + "step": 1881 + }, + { + "epoch": 0.13113611817580043, + "grad_norm": 0.7167984046454882, + "learning_rate": 6.736425409868682e-07, + "loss": 1.7738, + "step": 1882 + }, + { + "epoch": 0.13120579730341775, + "grad_norm": 0.730282334768594, + "learning_rate": 6.736149213376672e-07, + "loss": 1.5536, + "step": 1883 + }, + { + "epoch": 0.13127547643103507, + "grad_norm": 0.7103695925549198, + "learning_rate": 6.735872878574467e-07, + "loss": 1.553, + "step": 1884 + }, + { + "epoch": 0.1313451555586524, + "grad_norm": 0.6854489314675175, + "learning_rate": 6.73559640547531e-07, + "loss": 1.5653, + "step": 1885 + }, + { + "epoch": 0.13141483468626972, + "grad_norm": 0.7505672021974592, + "learning_rate": 6.735319794092449e-07, + "loss": 1.5383, + "step": 1886 + }, + { + "epoch": 0.13148451381388704, + "grad_norm": 0.732423515927833, + "learning_rate": 6.73504304443914e-07, + "loss": 1.4525, + "step": 1887 + }, + { + "epoch": 0.13155419294150436, + "grad_norm": 0.685880905255038, + "learning_rate": 6.734766156528645e-07, + "loss": 1.616, + "step": 1888 + }, + { + "epoch": 0.13162387206912168, + "grad_norm": 0.6960458179247873, + "learning_rate": 6.734489130374234e-07, + "loss": 1.5326, + "step": 1889 + }, + { + "epoch": 0.131693551196739, + "grad_norm": 0.6866922904575083, + "learning_rate": 6.734211965989182e-07, + "loss": 1.6037, + "step": 1890 + }, + { + "epoch": 0.13176323032435633, + "grad_norm": 0.673778268440158, + "learning_rate": 6.73393466338677e-07, + "loss": 1.535, + "step": 1891 + }, + { + "epoch": 0.13183290945197365, + "grad_norm": 0.690886200558052, + "learning_rate": 6.73365722258029e-07, + "loss": 1.4375, + "step": 1892 + }, + { + "epoch": 0.13190258857959097, + "grad_norm": 0.6994952752280379, + "learning_rate": 6.733379643583036e-07, + "loss": 1.5044, + "step": 1893 + }, + { + "epoch": 0.1319722677072083, + "grad_norm": 0.7264263810645032, + "learning_rate": 6.733101926408308e-07, + "loss": 1.6547, + "step": 1894 + }, + { + "epoch": 0.13204194683482562, + "grad_norm": 0.7103588905968264, + "learning_rate": 6.732824071069419e-07, + "loss": 1.5439, + "step": 1895 + }, + { + "epoch": 0.13211162596244294, + "grad_norm": 0.672222207099807, + "learning_rate": 6.732546077579681e-07, + "loss": 1.491, + "step": 1896 + }, + { + "epoch": 0.13218130509006026, + "grad_norm": 0.751933233368762, + "learning_rate": 6.732267945952418e-07, + "loss": 1.6748, + "step": 1897 + }, + { + "epoch": 0.1322509842176776, + "grad_norm": 0.7135892877523559, + "learning_rate": 6.731989676200958e-07, + "loss": 1.6594, + "step": 1898 + }, + { + "epoch": 0.1323206633452949, + "grad_norm": 0.6841008584696798, + "learning_rate": 6.731711268338635e-07, + "loss": 1.555, + "step": 1899 + }, + { + "epoch": 0.13239034247291223, + "grad_norm": 0.7548485821820744, + "learning_rate": 6.731432722378794e-07, + "loss": 1.5304, + "step": 1900 + }, + { + "epoch": 0.13246002160052955, + "grad_norm": 0.7137250878688367, + "learning_rate": 6.73115403833478e-07, + "loss": 1.4984, + "step": 1901 + }, + { + "epoch": 0.13252970072814688, + "grad_norm": 0.6812981296452358, + "learning_rate": 6.730875216219948e-07, + "loss": 1.5644, + "step": 1902 + }, + { + "epoch": 0.1325993798557642, + "grad_norm": 0.7216405903372481, + "learning_rate": 6.730596256047663e-07, + "loss": 1.5853, + "step": 1903 + }, + { + "epoch": 0.13266905898338152, + "grad_norm": 0.7696458881454737, + "learning_rate": 6.730317157831293e-07, + "loss": 1.5555, + "step": 1904 + }, + { + "epoch": 0.13273873811099884, + "grad_norm": 0.7513839173466563, + "learning_rate": 6.730037921584209e-07, + "loss": 1.5812, + "step": 1905 + }, + { + "epoch": 0.13280841723861617, + "grad_norm": 0.7038080392696825, + "learning_rate": 6.729758547319796e-07, + "loss": 1.5747, + "step": 1906 + }, + { + "epoch": 0.1328780963662335, + "grad_norm": 0.7021482922132364, + "learning_rate": 6.729479035051443e-07, + "loss": 1.6157, + "step": 1907 + }, + { + "epoch": 0.1329477754938508, + "grad_norm": 0.7128968760411135, + "learning_rate": 6.729199384792542e-07, + "loss": 1.5565, + "step": 1908 + }, + { + "epoch": 0.13301745462146813, + "grad_norm": 0.7591474416194975, + "learning_rate": 6.728919596556496e-07, + "loss": 1.5124, + "step": 1909 + }, + { + "epoch": 0.13308713374908546, + "grad_norm": 0.731997601252689, + "learning_rate": 6.728639670356711e-07, + "loss": 1.5297, + "step": 1910 + }, + { + "epoch": 0.13315681287670278, + "grad_norm": 0.7360419831946642, + "learning_rate": 6.728359606206605e-07, + "loss": 1.7696, + "step": 1911 + }, + { + "epoch": 0.1332264920043201, + "grad_norm": 0.7143694573394039, + "learning_rate": 6.728079404119597e-07, + "loss": 1.4961, + "step": 1912 + }, + { + "epoch": 0.13329617113193742, + "grad_norm": 0.6871856694114608, + "learning_rate": 6.727799064109116e-07, + "loss": 1.5909, + "step": 1913 + }, + { + "epoch": 0.13336585025955475, + "grad_norm": 0.7120372611141601, + "learning_rate": 6.727518586188593e-07, + "loss": 1.6738, + "step": 1914 + }, + { + "epoch": 0.13343552938717207, + "grad_norm": 0.7250648448379915, + "learning_rate": 6.727237970371475e-07, + "loss": 1.4843, + "step": 1915 + }, + { + "epoch": 0.1335052085147894, + "grad_norm": 0.7316783552643227, + "learning_rate": 6.726957216671206e-07, + "loss": 1.537, + "step": 1916 + }, + { + "epoch": 0.13357488764240671, + "grad_norm": 0.7110209139763414, + "learning_rate": 6.72667632510124e-07, + "loss": 1.5645, + "step": 1917 + }, + { + "epoch": 0.13364456677002404, + "grad_norm": 0.7095671591176976, + "learning_rate": 6.72639529567504e-07, + "loss": 1.5458, + "step": 1918 + }, + { + "epoch": 0.13371424589764136, + "grad_norm": 0.7720388159693496, + "learning_rate": 6.726114128406072e-07, + "loss": 1.4622, + "step": 1919 + }, + { + "epoch": 0.13378392502525868, + "grad_norm": 0.7088699128516145, + "learning_rate": 6.72583282330781e-07, + "loss": 1.476, + "step": 1920 + }, + { + "epoch": 0.133853604152876, + "grad_norm": 0.6908728730889724, + "learning_rate": 6.725551380393735e-07, + "loss": 1.5533, + "step": 1921 + }, + { + "epoch": 0.13392328328049333, + "grad_norm": 0.6820942269733361, + "learning_rate": 6.725269799677335e-07, + "loss": 1.5603, + "step": 1922 + }, + { + "epoch": 0.13399296240811065, + "grad_norm": 0.6923947732216261, + "learning_rate": 6.724988081172102e-07, + "loss": 1.5315, + "step": 1923 + }, + { + "epoch": 0.13406264153572797, + "grad_norm": 0.7251093460202046, + "learning_rate": 6.72470622489154e-07, + "loss": 1.5488, + "step": 1924 + }, + { + "epoch": 0.1341323206633453, + "grad_norm": 0.7012512826060395, + "learning_rate": 6.724424230849153e-07, + "loss": 1.4948, + "step": 1925 + }, + { + "epoch": 0.13420199979096262, + "grad_norm": 0.7212684831182445, + "learning_rate": 6.724142099058455e-07, + "loss": 1.56, + "step": 1926 + }, + { + "epoch": 0.13427167891857994, + "grad_norm": 0.7048665231478228, + "learning_rate": 6.723859829532968e-07, + "loss": 1.5274, + "step": 1927 + }, + { + "epoch": 0.13434135804619726, + "grad_norm": 0.7718463262225225, + "learning_rate": 6.723577422286217e-07, + "loss": 1.7148, + "step": 1928 + }, + { + "epoch": 0.13441103717381458, + "grad_norm": 0.6792581126537035, + "learning_rate": 6.723294877331739e-07, + "loss": 1.4344, + "step": 1929 + }, + { + "epoch": 0.1344807163014319, + "grad_norm": 0.7081966504199131, + "learning_rate": 6.723012194683071e-07, + "loss": 1.5669, + "step": 1930 + }, + { + "epoch": 0.13455039542904923, + "grad_norm": 0.7534230117240622, + "learning_rate": 6.722729374353759e-07, + "loss": 1.4576, + "step": 1931 + }, + { + "epoch": 0.13462007455666655, + "grad_norm": 0.6962594541305029, + "learning_rate": 6.722446416357359e-07, + "loss": 1.5744, + "step": 1932 + }, + { + "epoch": 0.13468975368428387, + "grad_norm": 0.7580060974065307, + "learning_rate": 6.722163320707429e-07, + "loss": 1.6539, + "step": 1933 + }, + { + "epoch": 0.1347594328119012, + "grad_norm": 0.673830372373551, + "learning_rate": 6.721880087417536e-07, + "loss": 1.4916, + "step": 1934 + }, + { + "epoch": 0.13482911193951852, + "grad_norm": 0.7218253612280532, + "learning_rate": 6.721596716501253e-07, + "loss": 1.4981, + "step": 1935 + }, + { + "epoch": 0.13489879106713584, + "grad_norm": 0.7531583849102892, + "learning_rate": 6.721313207972162e-07, + "loss": 1.6348, + "step": 1936 + }, + { + "epoch": 0.13496847019475317, + "grad_norm": 0.7429401300806515, + "learning_rate": 6.721029561843847e-07, + "loss": 1.6212, + "step": 1937 + }, + { + "epoch": 0.1350381493223705, + "grad_norm": 0.6828835052353207, + "learning_rate": 6.720745778129899e-07, + "loss": 1.3943, + "step": 1938 + }, + { + "epoch": 0.1351078284499878, + "grad_norm": 0.7085091052785595, + "learning_rate": 6.720461856843922e-07, + "loss": 1.5354, + "step": 1939 + }, + { + "epoch": 0.13517750757760513, + "grad_norm": 0.7364242437633206, + "learning_rate": 6.720177797999519e-07, + "loss": 1.7678, + "step": 1940 + }, + { + "epoch": 0.13524718670522246, + "grad_norm": 0.6570866689949861, + "learning_rate": 6.719893601610304e-07, + "loss": 1.5072, + "step": 1941 + }, + { + "epoch": 0.13531686583283978, + "grad_norm": 0.7943585246747022, + "learning_rate": 6.719609267689896e-07, + "loss": 1.6721, + "step": 1942 + }, + { + "epoch": 0.1353865449604571, + "grad_norm": 0.7188461240489994, + "learning_rate": 6.71932479625192e-07, + "loss": 1.5127, + "step": 1943 + }, + { + "epoch": 0.13545622408807442, + "grad_norm": 0.7475198285216123, + "learning_rate": 6.719040187310009e-07, + "loss": 1.6445, + "step": 1944 + }, + { + "epoch": 0.13552590321569175, + "grad_norm": 0.7258450468992407, + "learning_rate": 6.718755440877802e-07, + "loss": 1.5219, + "step": 1945 + }, + { + "epoch": 0.13559558234330907, + "grad_norm": 0.7468264368633342, + "learning_rate": 6.718470556968946e-07, + "loss": 1.5828, + "step": 1946 + }, + { + "epoch": 0.1356652614709264, + "grad_norm": 0.7125015542157345, + "learning_rate": 6.718185535597091e-07, + "loss": 1.5189, + "step": 1947 + }, + { + "epoch": 0.1357349405985437, + "grad_norm": 0.7036505846596398, + "learning_rate": 6.717900376775899e-07, + "loss": 1.5386, + "step": 1948 + }, + { + "epoch": 0.13580461972616104, + "grad_norm": 0.7382895590391404, + "learning_rate": 6.71761508051903e-07, + "loss": 1.5655, + "step": 1949 + }, + { + "epoch": 0.13587429885377836, + "grad_norm": 0.7511916226599143, + "learning_rate": 6.717329646840162e-07, + "loss": 1.6689, + "step": 1950 + }, + { + "epoch": 0.13594397798139568, + "grad_norm": 0.7217544848044565, + "learning_rate": 6.717044075752969e-07, + "loss": 1.5589, + "step": 1951 + }, + { + "epoch": 0.136013657109013, + "grad_norm": 0.6802227867126514, + "learning_rate": 6.716758367271138e-07, + "loss": 1.4658, + "step": 1952 + }, + { + "epoch": 0.13608333623663033, + "grad_norm": 0.6972733413609539, + "learning_rate": 6.716472521408362e-07, + "loss": 1.5688, + "step": 1953 + }, + { + "epoch": 0.13615301536424765, + "grad_norm": 0.7289731126070428, + "learning_rate": 6.716186538178338e-07, + "loss": 1.572, + "step": 1954 + }, + { + "epoch": 0.13622269449186497, + "grad_norm": 0.7096236736018127, + "learning_rate": 6.715900417594769e-07, + "loss": 1.5759, + "step": 1955 + }, + { + "epoch": 0.1362923736194823, + "grad_norm": 0.7300802470602689, + "learning_rate": 6.71561415967137e-07, + "loss": 1.5944, + "step": 1956 + }, + { + "epoch": 0.13636205274709962, + "grad_norm": 0.7424757456231104, + "learning_rate": 6.715327764421858e-07, + "loss": 1.5659, + "step": 1957 + }, + { + "epoch": 0.13643173187471694, + "grad_norm": 0.6961374129664488, + "learning_rate": 6.715041231859956e-07, + "loss": 1.6426, + "step": 1958 + }, + { + "epoch": 0.13650141100233426, + "grad_norm": 0.7139600080889801, + "learning_rate": 6.714754561999395e-07, + "loss": 1.4646, + "step": 1959 + }, + { + "epoch": 0.13657109012995158, + "grad_norm": 0.737591170989514, + "learning_rate": 6.714467754853917e-07, + "loss": 1.5856, + "step": 1960 + }, + { + "epoch": 0.1366407692575689, + "grad_norm": 0.7319158356469768, + "learning_rate": 6.714180810437263e-07, + "loss": 1.5179, + "step": 1961 + }, + { + "epoch": 0.13671044838518623, + "grad_norm": 0.8217089235096635, + "learning_rate": 6.713893728763184e-07, + "loss": 1.575, + "step": 1962 + }, + { + "epoch": 0.13678012751280355, + "grad_norm": 0.7349260888101283, + "learning_rate": 6.713606509845437e-07, + "loss": 1.4672, + "step": 1963 + }, + { + "epoch": 0.13684980664042087, + "grad_norm": 0.6870022968581666, + "learning_rate": 6.713319153697788e-07, + "loss": 1.611, + "step": 1964 + }, + { + "epoch": 0.1369194857680382, + "grad_norm": 0.6992649168712028, + "learning_rate": 6.713031660334007e-07, + "loss": 1.5757, + "step": 1965 + }, + { + "epoch": 0.13698916489565552, + "grad_norm": 0.7178246262660336, + "learning_rate": 6.712744029767871e-07, + "loss": 1.4761, + "step": 1966 + }, + { + "epoch": 0.13705884402327284, + "grad_norm": 0.7437799778592202, + "learning_rate": 6.712456262013164e-07, + "loss": 1.5656, + "step": 1967 + }, + { + "epoch": 0.13712852315089016, + "grad_norm": 0.7408121743553396, + "learning_rate": 6.712168357083677e-07, + "loss": 1.4855, + "step": 1968 + }, + { + "epoch": 0.1371982022785075, + "grad_norm": 0.7172543395443857, + "learning_rate": 6.711880314993205e-07, + "loss": 1.4648, + "step": 1969 + }, + { + "epoch": 0.1372678814061248, + "grad_norm": 0.7059469068330578, + "learning_rate": 6.711592135755555e-07, + "loss": 1.5729, + "step": 1970 + }, + { + "epoch": 0.13733756053374213, + "grad_norm": 0.7208503647318539, + "learning_rate": 6.711303819384533e-07, + "loss": 1.5759, + "step": 1971 + }, + { + "epoch": 0.13740723966135943, + "grad_norm": 0.7529947820776219, + "learning_rate": 6.711015365893959e-07, + "loss": 1.5245, + "step": 1972 + }, + { + "epoch": 0.13747691878897675, + "grad_norm": 0.7036672482791874, + "learning_rate": 6.710726775297655e-07, + "loss": 1.5135, + "step": 1973 + }, + { + "epoch": 0.13754659791659407, + "grad_norm": 0.7944363990655092, + "learning_rate": 6.710438047609452e-07, + "loss": 1.6168, + "step": 1974 + }, + { + "epoch": 0.1376162770442114, + "grad_norm": 0.7280065937367366, + "learning_rate": 6.710149182843183e-07, + "loss": 1.4074, + "step": 1975 + }, + { + "epoch": 0.13768595617182872, + "grad_norm": 0.727312215451071, + "learning_rate": 6.709860181012695e-07, + "loss": 1.6429, + "step": 1976 + }, + { + "epoch": 0.13775563529944604, + "grad_norm": 0.6953250639382146, + "learning_rate": 6.709571042131836e-07, + "loss": 1.596, + "step": 1977 + }, + { + "epoch": 0.13782531442706336, + "grad_norm": 0.7968221935128836, + "learning_rate": 6.709281766214462e-07, + "loss": 1.4812, + "step": 1978 + }, + { + "epoch": 0.13789499355468068, + "grad_norm": 0.731561061631931, + "learning_rate": 6.708992353274434e-07, + "loss": 1.5383, + "step": 1979 + }, + { + "epoch": 0.137964672682298, + "grad_norm": 0.697983322450153, + "learning_rate": 6.708702803325626e-07, + "loss": 1.4938, + "step": 1980 + }, + { + "epoch": 0.13803435180991533, + "grad_norm": 0.6999587134752484, + "learning_rate": 6.70841311638191e-07, + "loss": 1.5703, + "step": 1981 + }, + { + "epoch": 0.13810403093753265, + "grad_norm": 0.7522221971120099, + "learning_rate": 6.708123292457168e-07, + "loss": 1.4194, + "step": 1982 + }, + { + "epoch": 0.13817371006514997, + "grad_norm": 0.7996483404297469, + "learning_rate": 6.707833331565289e-07, + "loss": 1.3953, + "step": 1983 + }, + { + "epoch": 0.1382433891927673, + "grad_norm": 0.7338395677152775, + "learning_rate": 6.707543233720173e-07, + "loss": 1.6296, + "step": 1984 + }, + { + "epoch": 0.13831306832038462, + "grad_norm": 0.7426771851411896, + "learning_rate": 6.707252998935717e-07, + "loss": 1.5608, + "step": 1985 + }, + { + "epoch": 0.13838274744800194, + "grad_norm": 0.6530700467546554, + "learning_rate": 6.706962627225833e-07, + "loss": 1.5599, + "step": 1986 + }, + { + "epoch": 0.13845242657561926, + "grad_norm": 0.777687245902078, + "learning_rate": 6.706672118604433e-07, + "loss": 1.5721, + "step": 1987 + }, + { + "epoch": 0.1385221057032366, + "grad_norm": 0.6768719388702751, + "learning_rate": 6.706381473085441e-07, + "loss": 1.5406, + "step": 1988 + }, + { + "epoch": 0.1385917848308539, + "grad_norm": 0.7222488532422539, + "learning_rate": 6.706090690682784e-07, + "loss": 1.4633, + "step": 1989 + }, + { + "epoch": 0.13866146395847123, + "grad_norm": 0.723866634938114, + "learning_rate": 6.705799771410399e-07, + "loss": 1.5605, + "step": 1990 + }, + { + "epoch": 0.13873114308608855, + "grad_norm": 0.7870541821766887, + "learning_rate": 6.705508715282225e-07, + "loss": 1.5404, + "step": 1991 + }, + { + "epoch": 0.13880082221370588, + "grad_norm": 0.7334998345863415, + "learning_rate": 6.705217522312213e-07, + "loss": 1.6006, + "step": 1992 + }, + { + "epoch": 0.1388705013413232, + "grad_norm": 0.7960625479446524, + "learning_rate": 6.704926192514313e-07, + "loss": 1.6352, + "step": 1993 + }, + { + "epoch": 0.13894018046894052, + "grad_norm": 0.7140325267209116, + "learning_rate": 6.70463472590249e-07, + "loss": 1.5576, + "step": 1994 + }, + { + "epoch": 0.13900985959655784, + "grad_norm": 0.6927480097785106, + "learning_rate": 6.70434312249071e-07, + "loss": 1.4991, + "step": 1995 + }, + { + "epoch": 0.13907953872417517, + "grad_norm": 0.6642520213604098, + "learning_rate": 6.70405138229295e-07, + "loss": 1.4621, + "step": 1996 + }, + { + "epoch": 0.1391492178517925, + "grad_norm": 0.6805635598441102, + "learning_rate": 6.703759505323186e-07, + "loss": 1.6133, + "step": 1997 + }, + { + "epoch": 0.1392188969794098, + "grad_norm": 0.7558783219492097, + "learning_rate": 6.703467491595409e-07, + "loss": 1.6302, + "step": 1998 + }, + { + "epoch": 0.13928857610702713, + "grad_norm": 0.7137225258702494, + "learning_rate": 6.703175341123611e-07, + "loss": 1.546, + "step": 1999 + }, + { + "epoch": 0.13935825523464446, + "grad_norm": 0.6896011638861811, + "learning_rate": 6.702883053921793e-07, + "loss": 1.5004, + "step": 2000 + }, + { + "epoch": 0.13942793436226178, + "grad_norm": 0.7740936820333882, + "learning_rate": 6.702590630003963e-07, + "loss": 1.5578, + "step": 2001 + }, + { + "epoch": 0.1394976134898791, + "grad_norm": 0.7113997941083866, + "learning_rate": 6.702298069384134e-07, + "loss": 1.5332, + "step": 2002 + }, + { + "epoch": 0.13956729261749642, + "grad_norm": 0.6916645083605428, + "learning_rate": 6.702005372076325e-07, + "loss": 1.7124, + "step": 2003 + }, + { + "epoch": 0.13963697174511375, + "grad_norm": 0.8047028801866378, + "learning_rate": 6.701712538094564e-07, + "loss": 1.5595, + "step": 2004 + }, + { + "epoch": 0.13970665087273107, + "grad_norm": 0.6812523150401025, + "learning_rate": 6.701419567452884e-07, + "loss": 1.4507, + "step": 2005 + }, + { + "epoch": 0.1397763300003484, + "grad_norm": 0.7103344035535895, + "learning_rate": 6.701126460165324e-07, + "loss": 1.5772, + "step": 2006 + }, + { + "epoch": 0.13984600912796571, + "grad_norm": 0.7102780550548473, + "learning_rate": 6.700833216245931e-07, + "loss": 1.5876, + "step": 2007 + }, + { + "epoch": 0.13991568825558304, + "grad_norm": 0.7369837252532143, + "learning_rate": 6.700539835708757e-07, + "loss": 1.5054, + "step": 2008 + }, + { + "epoch": 0.13998536738320036, + "grad_norm": 0.74091795158049, + "learning_rate": 6.700246318567862e-07, + "loss": 1.547, + "step": 2009 + }, + { + "epoch": 0.14005504651081768, + "grad_norm": 0.8230971961676586, + "learning_rate": 6.699952664837312e-07, + "loss": 1.6581, + "step": 2010 + }, + { + "epoch": 0.140124725638435, + "grad_norm": 0.6696729949760143, + "learning_rate": 6.699658874531181e-07, + "loss": 1.5002, + "step": 2011 + }, + { + "epoch": 0.14019440476605233, + "grad_norm": 0.6628325796270378, + "learning_rate": 6.699364947663546e-07, + "loss": 1.5461, + "step": 2012 + }, + { + "epoch": 0.14026408389366965, + "grad_norm": 0.7288767149466291, + "learning_rate": 6.699070884248492e-07, + "loss": 1.674, + "step": 2013 + }, + { + "epoch": 0.14033376302128697, + "grad_norm": 0.6900581440512029, + "learning_rate": 6.698776684300113e-07, + "loss": 1.539, + "step": 2014 + }, + { + "epoch": 0.1404034421489043, + "grad_norm": 0.7013138547776667, + "learning_rate": 6.698482347832506e-07, + "loss": 1.519, + "step": 2015 + }, + { + "epoch": 0.14047312127652162, + "grad_norm": 0.7269407616493614, + "learning_rate": 6.698187874859778e-07, + "loss": 1.4797, + "step": 2016 + }, + { + "epoch": 0.14054280040413894, + "grad_norm": 0.7810767647483735, + "learning_rate": 6.69789326539604e-07, + "loss": 1.6596, + "step": 2017 + }, + { + "epoch": 0.14061247953175626, + "grad_norm": 0.7286918267880533, + "learning_rate": 6.697598519455409e-07, + "loss": 1.4667, + "step": 2018 + }, + { + "epoch": 0.14068215865937359, + "grad_norm": 0.6977287876601965, + "learning_rate": 6.697303637052011e-07, + "loss": 1.3941, + "step": 2019 + }, + { + "epoch": 0.1407518377869909, + "grad_norm": 0.6633872121941339, + "learning_rate": 6.697008618199978e-07, + "loss": 1.4926, + "step": 2020 + }, + { + "epoch": 0.14082151691460823, + "grad_norm": 0.6908639722598806, + "learning_rate": 6.696713462913447e-07, + "loss": 1.5853, + "step": 2021 + }, + { + "epoch": 0.14089119604222555, + "grad_norm": 0.7069874421515819, + "learning_rate": 6.696418171206563e-07, + "loss": 1.4058, + "step": 2022 + }, + { + "epoch": 0.14096087516984288, + "grad_norm": 0.7027241913436704, + "learning_rate": 6.696122743093476e-07, + "loss": 1.5894, + "step": 2023 + }, + { + "epoch": 0.1410305542974602, + "grad_norm": 0.7560903773846185, + "learning_rate": 6.695827178588346e-07, + "loss": 1.6811, + "step": 2024 + }, + { + "epoch": 0.14110023342507752, + "grad_norm": 0.7511736752192385, + "learning_rate": 6.695531477705333e-07, + "loss": 1.6184, + "step": 2025 + }, + { + "epoch": 0.14116991255269484, + "grad_norm": 0.6690392393806831, + "learning_rate": 6.69523564045861e-07, + "loss": 1.4891, + "step": 2026 + }, + { + "epoch": 0.14123959168031217, + "grad_norm": 0.6912196783448034, + "learning_rate": 6.694939666862355e-07, + "loss": 1.4651, + "step": 2027 + }, + { + "epoch": 0.1413092708079295, + "grad_norm": 0.77137577829856, + "learning_rate": 6.69464355693075e-07, + "loss": 1.5403, + "step": 2028 + }, + { + "epoch": 0.1413789499355468, + "grad_norm": 0.6865381749824687, + "learning_rate": 6.694347310677985e-07, + "loss": 1.6508, + "step": 2029 + }, + { + "epoch": 0.14144862906316413, + "grad_norm": 0.7566844700374754, + "learning_rate": 6.69405092811826e-07, + "loss": 1.5167, + "step": 2030 + }, + { + "epoch": 0.14151830819078146, + "grad_norm": 0.6952455915837616, + "learning_rate": 6.693754409265774e-07, + "loss": 1.4989, + "step": 2031 + }, + { + "epoch": 0.14158798731839878, + "grad_norm": 0.7130424330099704, + "learning_rate": 6.693457754134739e-07, + "loss": 1.4692, + "step": 2032 + }, + { + "epoch": 0.1416576664460161, + "grad_norm": 0.8873037536282933, + "learning_rate": 6.693160962739372e-07, + "loss": 1.4678, + "step": 2033 + }, + { + "epoch": 0.14172734557363342, + "grad_norm": 0.7036011332421183, + "learning_rate": 6.692864035093892e-07, + "loss": 1.5706, + "step": 2034 + }, + { + "epoch": 0.14179702470125075, + "grad_norm": 0.834179522399948, + "learning_rate": 6.692566971212533e-07, + "loss": 1.5409, + "step": 2035 + }, + { + "epoch": 0.14186670382886807, + "grad_norm": 0.6949574416710086, + "learning_rate": 6.69226977110953e-07, + "loss": 1.5529, + "step": 2036 + }, + { + "epoch": 0.1419363829564854, + "grad_norm": 0.7135623739474563, + "learning_rate": 6.691972434799122e-07, + "loss": 1.6741, + "step": 2037 + }, + { + "epoch": 0.1420060620841027, + "grad_norm": 0.6687369374009454, + "learning_rate": 6.691674962295562e-07, + "loss": 1.525, + "step": 2038 + }, + { + "epoch": 0.14207574121172004, + "grad_norm": 0.7433925112314416, + "learning_rate": 6.691377353613104e-07, + "loss": 1.4498, + "step": 2039 + }, + { + "epoch": 0.14214542033933736, + "grad_norm": 0.6745110392616911, + "learning_rate": 6.691079608766009e-07, + "loss": 1.6143, + "step": 2040 + }, + { + "epoch": 0.14221509946695468, + "grad_norm": 0.7382616845616087, + "learning_rate": 6.690781727768548e-07, + "loss": 1.541, + "step": 2041 + }, + { + "epoch": 0.142284778594572, + "grad_norm": 0.7045004156463536, + "learning_rate": 6.690483710634993e-07, + "loss": 1.4801, + "step": 2042 + }, + { + "epoch": 0.14235445772218933, + "grad_norm": 0.6965897989333067, + "learning_rate": 6.690185557379629e-07, + "loss": 1.489, + "step": 2043 + }, + { + "epoch": 0.14242413684980665, + "grad_norm": 0.7438196665174391, + "learning_rate": 6.689887268016741e-07, + "loss": 1.5633, + "step": 2044 + }, + { + "epoch": 0.14249381597742397, + "grad_norm": 0.7105715238696366, + "learning_rate": 6.689588842560625e-07, + "loss": 1.5686, + "step": 2045 + }, + { + "epoch": 0.1425634951050413, + "grad_norm": 0.7049215190610151, + "learning_rate": 6.689290281025582e-07, + "loss": 1.6438, + "step": 2046 + }, + { + "epoch": 0.14263317423265862, + "grad_norm": 0.6744774059408442, + "learning_rate": 6.68899158342592e-07, + "loss": 1.5095, + "step": 2047 + }, + { + "epoch": 0.14270285336027594, + "grad_norm": 0.680417356610093, + "learning_rate": 6.688692749775953e-07, + "loss": 1.5074, + "step": 2048 + }, + { + "epoch": 0.14277253248789326, + "grad_norm": 0.7151033904243572, + "learning_rate": 6.688393780090002e-07, + "loss": 1.604, + "step": 2049 + }, + { + "epoch": 0.14284221161551058, + "grad_norm": 0.724966418349942, + "learning_rate": 6.688094674382394e-07, + "loss": 1.5373, + "step": 2050 + }, + { + "epoch": 0.1429118907431279, + "grad_norm": 0.7360886176646285, + "learning_rate": 6.687795432667462e-07, + "loss": 1.5455, + "step": 2051 + }, + { + "epoch": 0.14298156987074523, + "grad_norm": 0.7172255708598817, + "learning_rate": 6.687496054959548e-07, + "loss": 1.6347, + "step": 2052 + }, + { + "epoch": 0.14305124899836255, + "grad_norm": 0.7699884407032171, + "learning_rate": 6.687196541272997e-07, + "loss": 1.6606, + "step": 2053 + }, + { + "epoch": 0.14312092812597987, + "grad_norm": 0.7014245175284832, + "learning_rate": 6.686896891622164e-07, + "loss": 1.6195, + "step": 2054 + }, + { + "epoch": 0.1431906072535972, + "grad_norm": 0.7050391085105184, + "learning_rate": 6.686597106021406e-07, + "loss": 1.4778, + "step": 2055 + }, + { + "epoch": 0.14326028638121452, + "grad_norm": 0.7475241606305587, + "learning_rate": 6.686297184485093e-07, + "loss": 1.522, + "step": 2056 + }, + { + "epoch": 0.14332996550883184, + "grad_norm": 0.7224955938632416, + "learning_rate": 6.685997127027597e-07, + "loss": 1.4863, + "step": 2057 + }, + { + "epoch": 0.14339964463644916, + "grad_norm": 0.7205299342896766, + "learning_rate": 6.685696933663295e-07, + "loss": 1.451, + "step": 2058 + }, + { + "epoch": 0.1434693237640665, + "grad_norm": 0.7375433453122535, + "learning_rate": 6.685396604406574e-07, + "loss": 1.5236, + "step": 2059 + }, + { + "epoch": 0.1435390028916838, + "grad_norm": 0.6683998772094839, + "learning_rate": 6.685096139271827e-07, + "loss": 1.519, + "step": 2060 + }, + { + "epoch": 0.14360868201930113, + "grad_norm": 0.7362126432095132, + "learning_rate": 6.684795538273452e-07, + "loss": 1.578, + "step": 2061 + }, + { + "epoch": 0.14367836114691845, + "grad_norm": 0.7395787426618212, + "learning_rate": 6.684494801425857e-07, + "loss": 1.5395, + "step": 2062 + }, + { + "epoch": 0.14374804027453578, + "grad_norm": 0.6894804607087299, + "learning_rate": 6.684193928743451e-07, + "loss": 1.5361, + "step": 2063 + }, + { + "epoch": 0.14381771940215307, + "grad_norm": 0.7135583941854905, + "learning_rate": 6.683892920240653e-07, + "loss": 1.4923, + "step": 2064 + }, + { + "epoch": 0.1438873985297704, + "grad_norm": 0.7829148162615202, + "learning_rate": 6.683591775931889e-07, + "loss": 1.5336, + "step": 2065 + }, + { + "epoch": 0.14395707765738772, + "grad_norm": 0.7303266441004318, + "learning_rate": 6.683290495831589e-07, + "loss": 1.6255, + "step": 2066 + }, + { + "epoch": 0.14402675678500504, + "grad_norm": 0.7839802506351182, + "learning_rate": 6.682989079954192e-07, + "loss": 1.5698, + "step": 2067 + }, + { + "epoch": 0.14409643591262236, + "grad_norm": 0.6629158355551916, + "learning_rate": 6.682687528314144e-07, + "loss": 1.5633, + "step": 2068 + }, + { + "epoch": 0.14416611504023968, + "grad_norm": 0.7241216398074919, + "learning_rate": 6.682385840925893e-07, + "loss": 1.545, + "step": 2069 + }, + { + "epoch": 0.144235794167857, + "grad_norm": 0.7164951634406063, + "learning_rate": 6.682084017803897e-07, + "loss": 1.4695, + "step": 2070 + }, + { + "epoch": 0.14430547329547433, + "grad_norm": 1.0271724115304646, + "learning_rate": 6.681782058962621e-07, + "loss": 1.4634, + "step": 2071 + }, + { + "epoch": 0.14437515242309165, + "grad_norm": 0.7823040178808669, + "learning_rate": 6.681479964416536e-07, + "loss": 1.5905, + "step": 2072 + }, + { + "epoch": 0.14444483155070897, + "grad_norm": 0.657817420957194, + "learning_rate": 6.681177734180118e-07, + "loss": 1.3752, + "step": 2073 + }, + { + "epoch": 0.1445145106783263, + "grad_norm": 0.6852262198461414, + "learning_rate": 6.68087536826785e-07, + "loss": 1.6397, + "step": 2074 + }, + { + "epoch": 0.14458418980594362, + "grad_norm": 0.756558074647055, + "learning_rate": 6.680572866694223e-07, + "loss": 1.635, + "step": 2075 + }, + { + "epoch": 0.14465386893356094, + "grad_norm": 0.7433246997372718, + "learning_rate": 6.680270229473733e-07, + "loss": 1.5365, + "step": 2076 + }, + { + "epoch": 0.14472354806117826, + "grad_norm": 0.78026047334549, + "learning_rate": 6.679967456620883e-07, + "loss": 1.5682, + "step": 2077 + }, + { + "epoch": 0.1447932271887956, + "grad_norm": 0.7072922446822634, + "learning_rate": 6.679664548150184e-07, + "loss": 1.4905, + "step": 2078 + }, + { + "epoch": 0.1448629063164129, + "grad_norm": 0.68509156320556, + "learning_rate": 6.67936150407615e-07, + "loss": 1.613, + "step": 2079 + }, + { + "epoch": 0.14493258544403023, + "grad_norm": 0.7336974814746486, + "learning_rate": 6.679058324413304e-07, + "loss": 1.6432, + "step": 2080 + }, + { + "epoch": 0.14500226457164755, + "grad_norm": 0.7801610741726437, + "learning_rate": 6.678755009176175e-07, + "loss": 1.7378, + "step": 2081 + }, + { + "epoch": 0.14507194369926488, + "grad_norm": 0.7463363382104677, + "learning_rate": 6.678451558379299e-07, + "loss": 1.5998, + "step": 2082 + }, + { + "epoch": 0.1451416228268822, + "grad_norm": 0.7695298804194249, + "learning_rate": 6.678147972037217e-07, + "loss": 1.4838, + "step": 2083 + }, + { + "epoch": 0.14521130195449952, + "grad_norm": 0.7298021201070521, + "learning_rate": 6.677844250164478e-07, + "loss": 1.7206, + "step": 2084 + }, + { + "epoch": 0.14528098108211684, + "grad_norm": 0.7045264264551108, + "learning_rate": 6.677540392775638e-07, + "loss": 1.6674, + "step": 2085 + }, + { + "epoch": 0.14535066020973417, + "grad_norm": 0.7080820049733552, + "learning_rate": 6.677236399885257e-07, + "loss": 1.3703, + "step": 2086 + }, + { + "epoch": 0.1454203393373515, + "grad_norm": 0.7405645499986991, + "learning_rate": 6.676932271507905e-07, + "loss": 1.6083, + "step": 2087 + }, + { + "epoch": 0.1454900184649688, + "grad_norm": 0.6755073635241531, + "learning_rate": 6.676628007658152e-07, + "loss": 1.4891, + "step": 2088 + }, + { + "epoch": 0.14555969759258613, + "grad_norm": 0.735196926318889, + "learning_rate": 6.676323608350584e-07, + "loss": 1.5538, + "step": 2089 + }, + { + "epoch": 0.14562937672020346, + "grad_norm": 0.7158832810121535, + "learning_rate": 6.676019073599786e-07, + "loss": 1.4395, + "step": 2090 + }, + { + "epoch": 0.14569905584782078, + "grad_norm": 0.8233995506467929, + "learning_rate": 6.675714403420352e-07, + "loss": 1.5909, + "step": 2091 + }, + { + "epoch": 0.1457687349754381, + "grad_norm": 0.6981047452018204, + "learning_rate": 6.675409597826883e-07, + "loss": 1.5379, + "step": 2092 + }, + { + "epoch": 0.14583841410305542, + "grad_norm": 0.7318049866634618, + "learning_rate": 6.675104656833985e-07, + "loss": 1.5167, + "step": 2093 + }, + { + "epoch": 0.14590809323067275, + "grad_norm": 0.6711256547636123, + "learning_rate": 6.674799580456273e-07, + "loss": 1.4706, + "step": 2094 + }, + { + "epoch": 0.14597777235829007, + "grad_norm": 0.7807090030494048, + "learning_rate": 6.674494368708365e-07, + "loss": 1.6408, + "step": 2095 + }, + { + "epoch": 0.1460474514859074, + "grad_norm": 0.7560459522384863, + "learning_rate": 6.674189021604889e-07, + "loss": 1.5771, + "step": 2096 + }, + { + "epoch": 0.14611713061352471, + "grad_norm": 0.7340658045296877, + "learning_rate": 6.673883539160478e-07, + "loss": 1.7691, + "step": 2097 + }, + { + "epoch": 0.14618680974114204, + "grad_norm": 0.7166206168364373, + "learning_rate": 6.673577921389768e-07, + "loss": 1.5528, + "step": 2098 + }, + { + "epoch": 0.14625648886875936, + "grad_norm": 0.7247772846536584, + "learning_rate": 6.673272168307408e-07, + "loss": 1.5739, + "step": 2099 + }, + { + "epoch": 0.14632616799637668, + "grad_norm": 0.7122755245520854, + "learning_rate": 6.672966279928051e-07, + "loss": 1.517, + "step": 2100 + }, + { + "epoch": 0.146395847123994, + "grad_norm": 0.709517658767935, + "learning_rate": 6.672660256266353e-07, + "loss": 1.5352, + "step": 2101 + }, + { + "epoch": 0.14646552625161133, + "grad_norm": 0.7526325805880144, + "learning_rate": 6.672354097336982e-07, + "loss": 1.4883, + "step": 2102 + }, + { + "epoch": 0.14653520537922865, + "grad_norm": 0.6888521999954886, + "learning_rate": 6.672047803154608e-07, + "loss": 1.6047, + "step": 2103 + }, + { + "epoch": 0.14660488450684597, + "grad_norm": 0.7384235333319291, + "learning_rate": 6.671741373733909e-07, + "loss": 1.5317, + "step": 2104 + }, + { + "epoch": 0.1466745636344633, + "grad_norm": 0.6736086119358556, + "learning_rate": 6.671434809089571e-07, + "loss": 1.5422, + "step": 2105 + }, + { + "epoch": 0.14674424276208062, + "grad_norm": 0.6661337493931547, + "learning_rate": 6.671128109236285e-07, + "loss": 1.501, + "step": 2106 + }, + { + "epoch": 0.14681392188969794, + "grad_norm": 0.7206096510228985, + "learning_rate": 6.670821274188747e-07, + "loss": 1.5317, + "step": 2107 + }, + { + "epoch": 0.14688360101731526, + "grad_norm": 0.6859249482438138, + "learning_rate": 6.670514303961664e-07, + "loss": 1.5287, + "step": 2108 + }, + { + "epoch": 0.14695328014493259, + "grad_norm": 0.7183963613016431, + "learning_rate": 6.670207198569745e-07, + "loss": 1.5517, + "step": 2109 + }, + { + "epoch": 0.1470229592725499, + "grad_norm": 0.7341898396673695, + "learning_rate": 6.669899958027707e-07, + "loss": 1.6989, + "step": 2110 + }, + { + "epoch": 0.14709263840016723, + "grad_norm": 0.7019688944440516, + "learning_rate": 6.669592582350273e-07, + "loss": 1.5366, + "step": 2111 + }, + { + "epoch": 0.14716231752778455, + "grad_norm": 0.7247432024634127, + "learning_rate": 6.669285071552174e-07, + "loss": 1.5756, + "step": 2112 + }, + { + "epoch": 0.14723199665540188, + "grad_norm": 0.7513673208989898, + "learning_rate": 6.668977425648149e-07, + "loss": 1.5718, + "step": 2113 + }, + { + "epoch": 0.1473016757830192, + "grad_norm": 0.7819729777404506, + "learning_rate": 6.668669644652937e-07, + "loss": 1.5458, + "step": 2114 + }, + { + "epoch": 0.14737135491063652, + "grad_norm": 0.750342648657007, + "learning_rate": 6.668361728581288e-07, + "loss": 1.4844, + "step": 2115 + }, + { + "epoch": 0.14744103403825384, + "grad_norm": 0.6908345407203063, + "learning_rate": 6.66805367744796e-07, + "loss": 1.6642, + "step": 2116 + }, + { + "epoch": 0.14751071316587117, + "grad_norm": 0.7344250712154198, + "learning_rate": 6.667745491267713e-07, + "loss": 1.6705, + "step": 2117 + }, + { + "epoch": 0.1475803922934885, + "grad_norm": 0.7510819008972629, + "learning_rate": 6.667437170055319e-07, + "loss": 1.5469, + "step": 2118 + }, + { + "epoch": 0.1476500714211058, + "grad_norm": 0.7555665446815104, + "learning_rate": 6.66712871382555e-07, + "loss": 1.6687, + "step": 2119 + }, + { + "epoch": 0.14771975054872313, + "grad_norm": 0.6604616741494789, + "learning_rate": 6.666820122593192e-07, + "loss": 1.5087, + "step": 2120 + }, + { + "epoch": 0.14778942967634046, + "grad_norm": 0.7348443193736736, + "learning_rate": 6.666511396373029e-07, + "loss": 1.5052, + "step": 2121 + }, + { + "epoch": 0.14785910880395778, + "grad_norm": 0.672958863560216, + "learning_rate": 6.666202535179859e-07, + "loss": 1.4961, + "step": 2122 + }, + { + "epoch": 0.1479287879315751, + "grad_norm": 0.7414274787253741, + "learning_rate": 6.665893539028481e-07, + "loss": 1.6306, + "step": 2123 + }, + { + "epoch": 0.14799846705919242, + "grad_norm": 0.727858108215168, + "learning_rate": 6.665584407933704e-07, + "loss": 1.5347, + "step": 2124 + }, + { + "epoch": 0.14806814618680975, + "grad_norm": 0.7294702840040114, + "learning_rate": 6.665275141910343e-07, + "loss": 1.3667, + "step": 2125 + }, + { + "epoch": 0.14813782531442707, + "grad_norm": 0.7349044745056671, + "learning_rate": 6.664965740973216e-07, + "loss": 1.7463, + "step": 2126 + }, + { + "epoch": 0.1482075044420444, + "grad_norm": 0.7758881380439382, + "learning_rate": 6.664656205137151e-07, + "loss": 1.6216, + "step": 2127 + }, + { + "epoch": 0.1482771835696617, + "grad_norm": 0.7495364666245434, + "learning_rate": 6.664346534416984e-07, + "loss": 1.6273, + "step": 2128 + }, + { + "epoch": 0.14834686269727904, + "grad_norm": 0.7040089705837504, + "learning_rate": 6.664036728827553e-07, + "loss": 1.5802, + "step": 2129 + }, + { + "epoch": 0.14841654182489636, + "grad_norm": 0.6667731454990864, + "learning_rate": 6.663726788383704e-07, + "loss": 1.5177, + "step": 2130 + }, + { + "epoch": 0.14848622095251368, + "grad_norm": 0.6997476564626672, + "learning_rate": 6.663416713100291e-07, + "loss": 1.5506, + "step": 2131 + }, + { + "epoch": 0.148555900080131, + "grad_norm": 0.6911327597765583, + "learning_rate": 6.663106502992175e-07, + "loss": 1.4742, + "step": 2132 + }, + { + "epoch": 0.14862557920774833, + "grad_norm": 0.7187628023471965, + "learning_rate": 6.662796158074218e-07, + "loss": 1.5424, + "step": 2133 + }, + { + "epoch": 0.14869525833536565, + "grad_norm": 0.8128369317563799, + "learning_rate": 6.662485678361296e-07, + "loss": 1.5732, + "step": 2134 + }, + { + "epoch": 0.14876493746298297, + "grad_norm": 0.782738410518543, + "learning_rate": 6.662175063868286e-07, + "loss": 1.6439, + "step": 2135 + }, + { + "epoch": 0.1488346165906003, + "grad_norm": 0.7095268270329291, + "learning_rate": 6.661864314610073e-07, + "loss": 1.5901, + "step": 2136 + }, + { + "epoch": 0.14890429571821762, + "grad_norm": 0.6888819253469545, + "learning_rate": 6.66155343060155e-07, + "loss": 1.4262, + "step": 2137 + }, + { + "epoch": 0.14897397484583494, + "grad_norm": 0.698034927176739, + "learning_rate": 6.661242411857614e-07, + "loss": 1.4935, + "step": 2138 + }, + { + "epoch": 0.14904365397345226, + "grad_norm": 0.7034207813846144, + "learning_rate": 6.660931258393171e-07, + "loss": 1.6056, + "step": 2139 + }, + { + "epoch": 0.14911333310106958, + "grad_norm": 0.714720004406207, + "learning_rate": 6.66061997022313e-07, + "loss": 1.4297, + "step": 2140 + }, + { + "epoch": 0.1491830122286869, + "grad_norm": 0.70720621104776, + "learning_rate": 6.660308547362412e-07, + "loss": 1.5657, + "step": 2141 + }, + { + "epoch": 0.14925269135630423, + "grad_norm": 0.6818808254535282, + "learning_rate": 6.659996989825938e-07, + "loss": 1.4919, + "step": 2142 + }, + { + "epoch": 0.14932237048392155, + "grad_norm": 1.2047689714013203, + "learning_rate": 6.65968529762864e-07, + "loss": 1.6194, + "step": 2143 + }, + { + "epoch": 0.14939204961153887, + "grad_norm": 0.7234531463009618, + "learning_rate": 6.659373470785454e-07, + "loss": 1.6053, + "step": 2144 + }, + { + "epoch": 0.1494617287391562, + "grad_norm": 0.8116627746519786, + "learning_rate": 6.659061509311322e-07, + "loss": 1.5547, + "step": 2145 + }, + { + "epoch": 0.14953140786677352, + "grad_norm": 0.7010744160731812, + "learning_rate": 6.658749413221197e-07, + "loss": 1.6412, + "step": 2146 + }, + { + "epoch": 0.14960108699439084, + "grad_norm": 0.7281161646233438, + "learning_rate": 6.658437182530034e-07, + "loss": 1.6422, + "step": 2147 + }, + { + "epoch": 0.14967076612200816, + "grad_norm": 0.7123633183041211, + "learning_rate": 6.658124817252796e-07, + "loss": 1.6012, + "step": 2148 + }, + { + "epoch": 0.1497404452496255, + "grad_norm": 0.7779085968751687, + "learning_rate": 6.657812317404451e-07, + "loss": 1.5819, + "step": 2149 + }, + { + "epoch": 0.1498101243772428, + "grad_norm": 0.7245364414178849, + "learning_rate": 6.657499682999976e-07, + "loss": 1.6811, + "step": 2150 + }, + { + "epoch": 0.14987980350486013, + "grad_norm": 0.7291548999158717, + "learning_rate": 6.657186914054353e-07, + "loss": 1.5303, + "step": 2151 + }, + { + "epoch": 0.14994948263247745, + "grad_norm": 0.6850518007923204, + "learning_rate": 6.656874010582568e-07, + "loss": 1.6017, + "step": 2152 + }, + { + "epoch": 0.15001916176009478, + "grad_norm": 0.7636515207105723, + "learning_rate": 6.65656097259962e-07, + "loss": 1.6767, + "step": 2153 + }, + { + "epoch": 0.1500888408877121, + "grad_norm": 0.7095907892985733, + "learning_rate": 6.656247800120508e-07, + "loss": 1.5909, + "step": 2154 + }, + { + "epoch": 0.15015852001532942, + "grad_norm": 0.8017043567634776, + "learning_rate": 6.65593449316024e-07, + "loss": 1.6075, + "step": 2155 + }, + { + "epoch": 0.15022819914294672, + "grad_norm": 0.7075063766905534, + "learning_rate": 6.655621051733831e-07, + "loss": 1.5024, + "step": 2156 + }, + { + "epoch": 0.15029787827056404, + "grad_norm": 0.7010902957564245, + "learning_rate": 6.6553074758563e-07, + "loss": 1.5442, + "step": 2157 + }, + { + "epoch": 0.15036755739818136, + "grad_norm": 0.6911690892841524, + "learning_rate": 6.654993765542677e-07, + "loss": 1.539, + "step": 2158 + }, + { + "epoch": 0.15043723652579868, + "grad_norm": 0.707345315596661, + "learning_rate": 6.654679920807994e-07, + "loss": 1.4807, + "step": 2159 + }, + { + "epoch": 0.150506915653416, + "grad_norm": 0.7415392921869378, + "learning_rate": 6.654365941667291e-07, + "loss": 1.5474, + "step": 2160 + }, + { + "epoch": 0.15057659478103333, + "grad_norm": 0.706173361060631, + "learning_rate": 6.654051828135615e-07, + "loss": 1.4426, + "step": 2161 + }, + { + "epoch": 0.15064627390865065, + "grad_norm": 0.7303397511035342, + "learning_rate": 6.653737580228018e-07, + "loss": 1.5252, + "step": 2162 + }, + { + "epoch": 0.15071595303626797, + "grad_norm": 0.6801700281353229, + "learning_rate": 6.653423197959561e-07, + "loss": 1.6305, + "step": 2163 + }, + { + "epoch": 0.1507856321638853, + "grad_norm": 0.7319612660717305, + "learning_rate": 6.653108681345308e-07, + "loss": 1.5535, + "step": 2164 + }, + { + "epoch": 0.15085531129150262, + "grad_norm": 0.739665723589392, + "learning_rate": 6.652794030400334e-07, + "loss": 1.4885, + "step": 2165 + }, + { + "epoch": 0.15092499041911994, + "grad_norm": 0.7300207068686901, + "learning_rate": 6.652479245139715e-07, + "loss": 1.556, + "step": 2166 + }, + { + "epoch": 0.15099466954673726, + "grad_norm": 0.7478877415893503, + "learning_rate": 6.652164325578538e-07, + "loss": 1.5387, + "step": 2167 + }, + { + "epoch": 0.1510643486743546, + "grad_norm": 0.7583353301827843, + "learning_rate": 6.651849271731893e-07, + "loss": 1.5861, + "step": 2168 + }, + { + "epoch": 0.1511340278019719, + "grad_norm": 0.7393233041700598, + "learning_rate": 6.651534083614879e-07, + "loss": 1.6991, + "step": 2169 + }, + { + "epoch": 0.15120370692958923, + "grad_norm": 0.6878120430029538, + "learning_rate": 6.6512187612426e-07, + "loss": 1.5472, + "step": 2170 + }, + { + "epoch": 0.15127338605720655, + "grad_norm": 0.7431748916767359, + "learning_rate": 6.650903304630168e-07, + "loss": 1.5066, + "step": 2171 + }, + { + "epoch": 0.15134306518482388, + "grad_norm": 0.7058577241054244, + "learning_rate": 6.650587713792698e-07, + "loss": 1.4571, + "step": 2172 + }, + { + "epoch": 0.1514127443124412, + "grad_norm": 0.7446431893400042, + "learning_rate": 6.650271988745317e-07, + "loss": 1.4761, + "step": 2173 + }, + { + "epoch": 0.15148242344005852, + "grad_norm": 0.7038821399942174, + "learning_rate": 6.649956129503152e-07, + "loss": 1.4445, + "step": 2174 + }, + { + "epoch": 0.15155210256767584, + "grad_norm": 0.8191464308394847, + "learning_rate": 6.649640136081343e-07, + "loss": 1.5595, + "step": 2175 + }, + { + "epoch": 0.15162178169529317, + "grad_norm": 0.7133519258464575, + "learning_rate": 6.64932400849503e-07, + "loss": 1.5781, + "step": 2176 + }, + { + "epoch": 0.1516914608229105, + "grad_norm": 0.6795663922248834, + "learning_rate": 6.649007746759362e-07, + "loss": 1.6141, + "step": 2177 + }, + { + "epoch": 0.1517611399505278, + "grad_norm": 0.69403858097068, + "learning_rate": 6.648691350889498e-07, + "loss": 1.4282, + "step": 2178 + }, + { + "epoch": 0.15183081907814514, + "grad_norm": 0.7119608330094656, + "learning_rate": 6.6483748209006e-07, + "loss": 1.3755, + "step": 2179 + }, + { + "epoch": 0.15190049820576246, + "grad_norm": 0.7564469568638418, + "learning_rate": 6.648058156807836e-07, + "loss": 1.5567, + "step": 2180 + }, + { + "epoch": 0.15197017733337978, + "grad_norm": 0.7486844475397654, + "learning_rate": 6.647741358626378e-07, + "loss": 1.5791, + "step": 2181 + }, + { + "epoch": 0.1520398564609971, + "grad_norm": 0.7055998936778343, + "learning_rate": 6.647424426371411e-07, + "loss": 1.5442, + "step": 2182 + }, + { + "epoch": 0.15210953558861443, + "grad_norm": 0.7470502207739791, + "learning_rate": 6.647107360058124e-07, + "loss": 1.5793, + "step": 2183 + }, + { + "epoch": 0.15217921471623175, + "grad_norm": 0.7341460938876649, + "learning_rate": 6.64679015970171e-07, + "loss": 1.5151, + "step": 2184 + }, + { + "epoch": 0.15224889384384907, + "grad_norm": 0.7346400065000422, + "learning_rate": 6.646472825317368e-07, + "loss": 1.5545, + "step": 2185 + }, + { + "epoch": 0.1523185729714664, + "grad_norm": 0.7201065762380195, + "learning_rate": 6.646155356920309e-07, + "loss": 1.5169, + "step": 2186 + }, + { + "epoch": 0.15238825209908372, + "grad_norm": 0.7393498689538193, + "learning_rate": 6.645837754525743e-07, + "loss": 1.5674, + "step": 2187 + }, + { + "epoch": 0.15245793122670104, + "grad_norm": 0.7785664711016976, + "learning_rate": 6.645520018148894e-07, + "loss": 1.5922, + "step": 2188 + }, + { + "epoch": 0.15252761035431836, + "grad_norm": 0.7293345596385744, + "learning_rate": 6.645202147804986e-07, + "loss": 1.5481, + "step": 2189 + }, + { + "epoch": 0.15259728948193568, + "grad_norm": 0.6894072142217631, + "learning_rate": 6.644884143509253e-07, + "loss": 1.389, + "step": 2190 + }, + { + "epoch": 0.152666968609553, + "grad_norm": 0.6740333835131167, + "learning_rate": 6.644566005276932e-07, + "loss": 1.572, + "step": 2191 + }, + { + "epoch": 0.15273664773717033, + "grad_norm": 0.6956811615086338, + "learning_rate": 6.644247733123273e-07, + "loss": 1.5829, + "step": 2192 + }, + { + "epoch": 0.15280632686478765, + "grad_norm": 0.7320923923783303, + "learning_rate": 6.643929327063526e-07, + "loss": 1.628, + "step": 2193 + }, + { + "epoch": 0.15287600599240497, + "grad_norm": 0.7024646754235628, + "learning_rate": 6.643610787112949e-07, + "loss": 1.4922, + "step": 2194 + }, + { + "epoch": 0.1529456851200223, + "grad_norm": 0.7097415544166261, + "learning_rate": 6.643292113286809e-07, + "loss": 1.5992, + "step": 2195 + }, + { + "epoch": 0.15301536424763962, + "grad_norm": 0.6985595433662738, + "learning_rate": 6.642973305600375e-07, + "loss": 1.4852, + "step": 2196 + }, + { + "epoch": 0.15308504337525694, + "grad_norm": 0.6991483533810473, + "learning_rate": 6.642654364068927e-07, + "loss": 1.4809, + "step": 2197 + }, + { + "epoch": 0.15315472250287426, + "grad_norm": 0.7908856281336831, + "learning_rate": 6.642335288707749e-07, + "loss": 1.5996, + "step": 2198 + }, + { + "epoch": 0.15322440163049159, + "grad_norm": 0.696976024748579, + "learning_rate": 6.642016079532131e-07, + "loss": 1.5017, + "step": 2199 + }, + { + "epoch": 0.1532940807581089, + "grad_norm": 0.6691993666994807, + "learning_rate": 6.64169673655737e-07, + "loss": 1.5449, + "step": 2200 + }, + { + "epoch": 0.15336375988572623, + "grad_norm": 0.738678893957917, + "learning_rate": 6.641377259798771e-07, + "loss": 1.5894, + "step": 2201 + }, + { + "epoch": 0.15343343901334355, + "grad_norm": 0.6999886972654056, + "learning_rate": 6.641057649271644e-07, + "loss": 1.5552, + "step": 2202 + }, + { + "epoch": 0.15350311814096088, + "grad_norm": 0.7839373696668602, + "learning_rate": 6.640737904991303e-07, + "loss": 1.628, + "step": 2203 + }, + { + "epoch": 0.1535727972685782, + "grad_norm": 0.6958760832939552, + "learning_rate": 6.640418026973073e-07, + "loss": 1.5806, + "step": 2204 + }, + { + "epoch": 0.15364247639619552, + "grad_norm": 0.6935626553735474, + "learning_rate": 6.640098015232282e-07, + "loss": 1.5275, + "step": 2205 + }, + { + "epoch": 0.15371215552381284, + "grad_norm": 0.7681046260488631, + "learning_rate": 6.639777869784266e-07, + "loss": 1.6231, + "step": 2206 + }, + { + "epoch": 0.15378183465143017, + "grad_norm": 0.6894945589980458, + "learning_rate": 6.639457590644367e-07, + "loss": 1.4775, + "step": 2207 + }, + { + "epoch": 0.1538515137790475, + "grad_norm": 0.7261850091840004, + "learning_rate": 6.639137177827935e-07, + "loss": 1.5407, + "step": 2208 + }, + { + "epoch": 0.1539211929066648, + "grad_norm": 0.6606501584232484, + "learning_rate": 6.638816631350324e-07, + "loss": 1.4989, + "step": 2209 + }, + { + "epoch": 0.15399087203428213, + "grad_norm": 0.7152784548111163, + "learning_rate": 6.638495951226891e-07, + "loss": 1.4855, + "step": 2210 + }, + { + "epoch": 0.15406055116189946, + "grad_norm": 0.7522656609768341, + "learning_rate": 6.638175137473011e-07, + "loss": 1.5111, + "step": 2211 + }, + { + "epoch": 0.15413023028951678, + "grad_norm": 0.7745867847921484, + "learning_rate": 6.637854190104053e-07, + "loss": 1.534, + "step": 2212 + }, + { + "epoch": 0.1541999094171341, + "grad_norm": 0.7233835267335126, + "learning_rate": 6.637533109135399e-07, + "loss": 1.623, + "step": 2213 + }, + { + "epoch": 0.15426958854475142, + "grad_norm": 0.7270410330528998, + "learning_rate": 6.637211894582435e-07, + "loss": 1.6425, + "step": 2214 + }, + { + "epoch": 0.15433926767236875, + "grad_norm": 0.7697491418177266, + "learning_rate": 6.636890546460556e-07, + "loss": 1.5283, + "step": 2215 + }, + { + "epoch": 0.15440894679998607, + "grad_norm": 0.7117412043799898, + "learning_rate": 6.636569064785162e-07, + "loss": 1.5912, + "step": 2216 + }, + { + "epoch": 0.1544786259276034, + "grad_norm": 0.7505816377254368, + "learning_rate": 6.636247449571654e-07, + "loss": 1.458, + "step": 2217 + }, + { + "epoch": 0.1545483050552207, + "grad_norm": 0.6735296173568921, + "learning_rate": 6.63592570083545e-07, + "loss": 1.5044, + "step": 2218 + }, + { + "epoch": 0.15461798418283804, + "grad_norm": 0.6846941209657414, + "learning_rate": 6.635603818591967e-07, + "loss": 1.5567, + "step": 2219 + }, + { + "epoch": 0.15468766331045536, + "grad_norm": 0.6929598112898522, + "learning_rate": 6.635281802856632e-07, + "loss": 1.5509, + "step": 2220 + }, + { + "epoch": 0.15475734243807268, + "grad_norm": 0.6792500496086945, + "learning_rate": 6.634959653644873e-07, + "loss": 1.5822, + "step": 2221 + }, + { + "epoch": 0.15482702156569, + "grad_norm": 0.6842858576959354, + "learning_rate": 6.634637370972131e-07, + "loss": 1.5215, + "step": 2222 + }, + { + "epoch": 0.15489670069330733, + "grad_norm": 0.6924953687899594, + "learning_rate": 6.634314954853847e-07, + "loss": 1.5317, + "step": 2223 + }, + { + "epoch": 0.15496637982092465, + "grad_norm": 0.7730565766794356, + "learning_rate": 6.633992405305477e-07, + "loss": 1.5001, + "step": 2224 + }, + { + "epoch": 0.15503605894854197, + "grad_norm": 0.7204812325831732, + "learning_rate": 6.633669722342475e-07, + "loss": 1.5332, + "step": 2225 + }, + { + "epoch": 0.1551057380761593, + "grad_norm": 0.6750704593792517, + "learning_rate": 6.633346905980304e-07, + "loss": 1.5395, + "step": 2226 + }, + { + "epoch": 0.15517541720377662, + "grad_norm": 0.7399175686766039, + "learning_rate": 6.633023956234436e-07, + "loss": 1.6439, + "step": 2227 + }, + { + "epoch": 0.15524509633139394, + "grad_norm": 0.6596536831519212, + "learning_rate": 6.632700873120346e-07, + "loss": 1.4794, + "step": 2228 + }, + { + "epoch": 0.15531477545901126, + "grad_norm": 0.6992076941173299, + "learning_rate": 6.632377656653518e-07, + "loss": 1.5523, + "step": 2229 + }, + { + "epoch": 0.15538445458662858, + "grad_norm": 0.680191070215852, + "learning_rate": 6.63205430684944e-07, + "loss": 1.7234, + "step": 2230 + }, + { + "epoch": 0.1554541337142459, + "grad_norm": 0.7405456874943854, + "learning_rate": 6.631730823723609e-07, + "loss": 1.6782, + "step": 2231 + }, + { + "epoch": 0.15552381284186323, + "grad_norm": 0.7827715409835366, + "learning_rate": 6.631407207291526e-07, + "loss": 1.5043, + "step": 2232 + }, + { + "epoch": 0.15559349196948055, + "grad_norm": 0.7125663974576547, + "learning_rate": 6.631083457568699e-07, + "loss": 1.5413, + "step": 2233 + }, + { + "epoch": 0.15566317109709787, + "grad_norm": 0.7187003784037695, + "learning_rate": 6.630759574570644e-07, + "loss": 1.4771, + "step": 2234 + }, + { + "epoch": 0.1557328502247152, + "grad_norm": 0.6781974858369967, + "learning_rate": 6.63043555831288e-07, + "loss": 1.523, + "step": 2235 + }, + { + "epoch": 0.15580252935233252, + "grad_norm": 0.6951594427619806, + "learning_rate": 6.630111408810937e-07, + "loss": 1.6086, + "step": 2236 + }, + { + "epoch": 0.15587220847994984, + "grad_norm": 0.7338407170382629, + "learning_rate": 6.629787126080348e-07, + "loss": 1.6122, + "step": 2237 + }, + { + "epoch": 0.15594188760756716, + "grad_norm": 0.7159255072884011, + "learning_rate": 6.629462710136653e-07, + "loss": 1.5316, + "step": 2238 + }, + { + "epoch": 0.1560115667351845, + "grad_norm": 0.7532205783926076, + "learning_rate": 6.629138160995399e-07, + "loss": 1.7628, + "step": 2239 + }, + { + "epoch": 0.1560812458628018, + "grad_norm": 0.6981258987523404, + "learning_rate": 6.628813478672139e-07, + "loss": 1.4858, + "step": 2240 + }, + { + "epoch": 0.15615092499041913, + "grad_norm": 0.6879762245284705, + "learning_rate": 6.628488663182431e-07, + "loss": 1.5243, + "step": 2241 + }, + { + "epoch": 0.15622060411803645, + "grad_norm": 0.7106885629753927, + "learning_rate": 6.628163714541844e-07, + "loss": 1.4806, + "step": 2242 + }, + { + "epoch": 0.15629028324565378, + "grad_norm": 0.774813261180234, + "learning_rate": 6.627838632765946e-07, + "loss": 1.5369, + "step": 2243 + }, + { + "epoch": 0.1563599623732711, + "grad_norm": 0.7211700093562131, + "learning_rate": 6.627513417870319e-07, + "loss": 1.5611, + "step": 2244 + }, + { + "epoch": 0.15642964150088842, + "grad_norm": 0.6932231497601744, + "learning_rate": 6.627188069870548e-07, + "loss": 1.4099, + "step": 2245 + }, + { + "epoch": 0.15649932062850574, + "grad_norm": 0.7036873950506464, + "learning_rate": 6.626862588782221e-07, + "loss": 1.5305, + "step": 2246 + }, + { + "epoch": 0.15656899975612304, + "grad_norm": 0.7089690133313291, + "learning_rate": 6.626536974620939e-07, + "loss": 1.4183, + "step": 2247 + }, + { + "epoch": 0.15663867888374036, + "grad_norm": 0.7105684077832143, + "learning_rate": 6.626211227402306e-07, + "loss": 1.5593, + "step": 2248 + }, + { + "epoch": 0.15670835801135768, + "grad_norm": 0.6832997394422372, + "learning_rate": 6.625885347141931e-07, + "loss": 1.4509, + "step": 2249 + }, + { + "epoch": 0.156778037138975, + "grad_norm": 0.6977345495829199, + "learning_rate": 6.625559333855431e-07, + "loss": 1.6193, + "step": 2250 + }, + { + "epoch": 0.15684771626659233, + "grad_norm": 0.6908184277057992, + "learning_rate": 6.625233187558431e-07, + "loss": 1.5806, + "step": 2251 + }, + { + "epoch": 0.15691739539420965, + "grad_norm": 0.7326095761597486, + "learning_rate": 6.624906908266557e-07, + "loss": 1.6088, + "step": 2252 + }, + { + "epoch": 0.15698707452182697, + "grad_norm": 0.650882245462365, + "learning_rate": 6.624580495995447e-07, + "loss": 1.4173, + "step": 2253 + }, + { + "epoch": 0.1570567536494443, + "grad_norm": 0.7035964201999952, + "learning_rate": 6.624253950760746e-07, + "loss": 1.6664, + "step": 2254 + }, + { + "epoch": 0.15712643277706162, + "grad_norm": 0.7171947278524486, + "learning_rate": 6.623927272578098e-07, + "loss": 1.5436, + "step": 2255 + }, + { + "epoch": 0.15719611190467894, + "grad_norm": 0.7582978366124182, + "learning_rate": 6.623600461463162e-07, + "loss": 1.5061, + "step": 2256 + }, + { + "epoch": 0.15726579103229626, + "grad_norm": 0.7197080632735179, + "learning_rate": 6.623273517431597e-07, + "loss": 1.4875, + "step": 2257 + }, + { + "epoch": 0.1573354701599136, + "grad_norm": 0.6975155150989609, + "learning_rate": 6.622946440499072e-07, + "loss": 1.375, + "step": 2258 + }, + { + "epoch": 0.1574051492875309, + "grad_norm": 0.7211246058188371, + "learning_rate": 6.62261923068126e-07, + "loss": 1.5137, + "step": 2259 + }, + { + "epoch": 0.15747482841514823, + "grad_norm": 0.6786300156106961, + "learning_rate": 6.622291887993843e-07, + "loss": 1.4291, + "step": 2260 + }, + { + "epoch": 0.15754450754276556, + "grad_norm": 0.7286101497808452, + "learning_rate": 6.621964412452507e-07, + "loss": 1.4559, + "step": 2261 + }, + { + "epoch": 0.15761418667038288, + "grad_norm": 0.6679841831373213, + "learning_rate": 6.621636804072947e-07, + "loss": 1.4893, + "step": 2262 + }, + { + "epoch": 0.1576838657980002, + "grad_norm": 0.7239015658745319, + "learning_rate": 6.621309062870859e-07, + "loss": 1.5985, + "step": 2263 + }, + { + "epoch": 0.15775354492561752, + "grad_norm": 0.8033552658475741, + "learning_rate": 6.620981188861952e-07, + "loss": 1.5316, + "step": 2264 + }, + { + "epoch": 0.15782322405323485, + "grad_norm": 0.8488918942742487, + "learning_rate": 6.620653182061938e-07, + "loss": 1.6772, + "step": 2265 + }, + { + "epoch": 0.15789290318085217, + "grad_norm": 0.7367837946918524, + "learning_rate": 6.620325042486537e-07, + "loss": 1.5063, + "step": 2266 + }, + { + "epoch": 0.1579625823084695, + "grad_norm": 0.7017635363309525, + "learning_rate": 6.619996770151471e-07, + "loss": 1.3885, + "step": 2267 + }, + { + "epoch": 0.1580322614360868, + "grad_norm": 0.7708473358379301, + "learning_rate": 6.619668365072472e-07, + "loss": 1.5045, + "step": 2268 + }, + { + "epoch": 0.15810194056370414, + "grad_norm": 0.6922310385911277, + "learning_rate": 6.619339827265281e-07, + "loss": 1.445, + "step": 2269 + }, + { + "epoch": 0.15817161969132146, + "grad_norm": 0.7504965139466662, + "learning_rate": 6.619011156745639e-07, + "loss": 1.5939, + "step": 2270 + }, + { + "epoch": 0.15824129881893878, + "grad_norm": 0.7091372028940393, + "learning_rate": 6.618682353529299e-07, + "loss": 1.5027, + "step": 2271 + }, + { + "epoch": 0.1583109779465561, + "grad_norm": 0.6747431666388362, + "learning_rate": 6.618353417632015e-07, + "loss": 1.5093, + "step": 2272 + }, + { + "epoch": 0.15838065707417343, + "grad_norm": 0.7506009039438517, + "learning_rate": 6.618024349069553e-07, + "loss": 1.5443, + "step": 2273 + }, + { + "epoch": 0.15845033620179075, + "grad_norm": 0.7136774347676139, + "learning_rate": 6.617695147857681e-07, + "loss": 1.4922, + "step": 2274 + }, + { + "epoch": 0.15852001532940807, + "grad_norm": 0.6607762672674673, + "learning_rate": 6.617365814012176e-07, + "loss": 1.4119, + "step": 2275 + }, + { + "epoch": 0.1585896944570254, + "grad_norm": 0.7219437275521969, + "learning_rate": 6.61703634754882e-07, + "loss": 1.5683, + "step": 2276 + }, + { + "epoch": 0.15865937358464272, + "grad_norm": 0.7557107319667966, + "learning_rate": 6.616706748483401e-07, + "loss": 1.6325, + "step": 2277 + }, + { + "epoch": 0.15872905271226004, + "grad_norm": 0.7296816006462254, + "learning_rate": 6.616377016831715e-07, + "loss": 1.6051, + "step": 2278 + }, + { + "epoch": 0.15879873183987736, + "grad_norm": 0.6952674966414605, + "learning_rate": 6.616047152609562e-07, + "loss": 1.484, + "step": 2279 + }, + { + "epoch": 0.15886841096749468, + "grad_norm": 0.7239655613929008, + "learning_rate": 6.615717155832753e-07, + "loss": 1.4567, + "step": 2280 + }, + { + "epoch": 0.158938090095112, + "grad_norm": 0.6928217629736545, + "learning_rate": 6.615387026517097e-07, + "loss": 1.6081, + "step": 2281 + }, + { + "epoch": 0.15900776922272933, + "grad_norm": 0.6737837113780031, + "learning_rate": 6.61505676467842e-07, + "loss": 1.5322, + "step": 2282 + }, + { + "epoch": 0.15907744835034665, + "grad_norm": 0.7578826835208271, + "learning_rate": 6.614726370332546e-07, + "loss": 1.5239, + "step": 2283 + }, + { + "epoch": 0.15914712747796397, + "grad_norm": 0.707541021895692, + "learning_rate": 6.614395843495308e-07, + "loss": 1.4852, + "step": 2284 + }, + { + "epoch": 0.1592168066055813, + "grad_norm": 0.7077574571137548, + "learning_rate": 6.614065184182546e-07, + "loss": 1.6096, + "step": 2285 + }, + { + "epoch": 0.15928648573319862, + "grad_norm": 0.7182478632727132, + "learning_rate": 6.613734392410106e-07, + "loss": 1.5395, + "step": 2286 + }, + { + "epoch": 0.15935616486081594, + "grad_norm": 0.7367290616820319, + "learning_rate": 6.61340346819384e-07, + "loss": 1.5181, + "step": 2287 + }, + { + "epoch": 0.15942584398843326, + "grad_norm": 0.7042290216704357, + "learning_rate": 6.613072411549606e-07, + "loss": 1.5792, + "step": 2288 + }, + { + "epoch": 0.15949552311605059, + "grad_norm": 0.7080735049459942, + "learning_rate": 6.61274122249327e-07, + "loss": 1.5474, + "step": 2289 + }, + { + "epoch": 0.1595652022436679, + "grad_norm": 0.8896374246651081, + "learning_rate": 6.612409901040703e-07, + "loss": 1.697, + "step": 2290 + }, + { + "epoch": 0.15963488137128523, + "grad_norm": 0.7105735026902991, + "learning_rate": 6.612078447207782e-07, + "loss": 1.5946, + "step": 2291 + }, + { + "epoch": 0.15970456049890255, + "grad_norm": 0.6961245254676526, + "learning_rate": 6.611746861010392e-07, + "loss": 1.4702, + "step": 2292 + }, + { + "epoch": 0.15977423962651988, + "grad_norm": 0.7613831462750842, + "learning_rate": 6.611415142464423e-07, + "loss": 1.4611, + "step": 2293 + }, + { + "epoch": 0.1598439187541372, + "grad_norm": 0.7766601055554404, + "learning_rate": 6.61108329158577e-07, + "loss": 1.5536, + "step": 2294 + }, + { + "epoch": 0.15991359788175452, + "grad_norm": 0.7216761614181849, + "learning_rate": 6.610751308390338e-07, + "loss": 1.62, + "step": 2295 + }, + { + "epoch": 0.15998327700937184, + "grad_norm": 0.7602123159942696, + "learning_rate": 6.610419192894035e-07, + "loss": 1.5354, + "step": 2296 + }, + { + "epoch": 0.16005295613698917, + "grad_norm": 0.7208680007068906, + "learning_rate": 6.610086945112779e-07, + "loss": 1.507, + "step": 2297 + }, + { + "epoch": 0.1601226352646065, + "grad_norm": 0.7271009203166914, + "learning_rate": 6.609754565062488e-07, + "loss": 1.5454, + "step": 2298 + }, + { + "epoch": 0.1601923143922238, + "grad_norm": 0.7155591566547312, + "learning_rate": 6.609422052759093e-07, + "loss": 1.5072, + "step": 2299 + }, + { + "epoch": 0.16026199351984113, + "grad_norm": 0.7646577990143016, + "learning_rate": 6.60908940821853e-07, + "loss": 1.5958, + "step": 2300 + }, + { + "epoch": 0.16033167264745846, + "grad_norm": 0.8432643486292445, + "learning_rate": 6.608756631456737e-07, + "loss": 1.5313, + "step": 2301 + }, + { + "epoch": 0.16040135177507578, + "grad_norm": 0.7165983860712775, + "learning_rate": 6.608423722489663e-07, + "loss": 1.5866, + "step": 2302 + }, + { + "epoch": 0.1604710309026931, + "grad_norm": 0.6948385437059152, + "learning_rate": 6.608090681333261e-07, + "loss": 1.4906, + "step": 2303 + }, + { + "epoch": 0.16054071003031042, + "grad_norm": 0.6881623948883927, + "learning_rate": 6.607757508003492e-07, + "loss": 1.6308, + "step": 2304 + }, + { + "epoch": 0.16061038915792775, + "grad_norm": 0.7202743271110196, + "learning_rate": 6.60742420251632e-07, + "loss": 1.5586, + "step": 2305 + }, + { + "epoch": 0.16068006828554507, + "grad_norm": 0.728905182320733, + "learning_rate": 6.607090764887721e-07, + "loss": 1.382, + "step": 2306 + }, + { + "epoch": 0.1607497474131624, + "grad_norm": 0.7111857908738513, + "learning_rate": 6.606757195133672e-07, + "loss": 1.5649, + "step": 2307 + }, + { + "epoch": 0.16081942654077971, + "grad_norm": 0.7302731077054829, + "learning_rate": 6.606423493270158e-07, + "loss": 1.5731, + "step": 2308 + }, + { + "epoch": 0.16088910566839704, + "grad_norm": 0.6801391548735751, + "learning_rate": 6.606089659313172e-07, + "loss": 1.5673, + "step": 2309 + }, + { + "epoch": 0.16095878479601436, + "grad_norm": 0.7827230409718977, + "learning_rate": 6.605755693278711e-07, + "loss": 1.502, + "step": 2310 + }, + { + "epoch": 0.16102846392363168, + "grad_norm": 0.7035852984193754, + "learning_rate": 6.605421595182779e-07, + "loss": 1.5612, + "step": 2311 + }, + { + "epoch": 0.161098143051249, + "grad_norm": 0.6825492690800449, + "learning_rate": 6.605087365041389e-07, + "loss": 1.5236, + "step": 2312 + }, + { + "epoch": 0.16116782217886633, + "grad_norm": 0.6942928057626722, + "learning_rate": 6.604753002870555e-07, + "loss": 1.5146, + "step": 2313 + }, + { + "epoch": 0.16123750130648365, + "grad_norm": 0.8494657143717437, + "learning_rate": 6.604418508686302e-07, + "loss": 1.7396, + "step": 2314 + }, + { + "epoch": 0.16130718043410097, + "grad_norm": 0.6673471103730584, + "learning_rate": 6.604083882504659e-07, + "loss": 1.5284, + "step": 2315 + }, + { + "epoch": 0.1613768595617183, + "grad_norm": 0.7528593628272984, + "learning_rate": 6.603749124341663e-07, + "loss": 1.6444, + "step": 2316 + }, + { + "epoch": 0.16144653868933562, + "grad_norm": 0.7210025295412614, + "learning_rate": 6.603414234213357e-07, + "loss": 1.6882, + "step": 2317 + }, + { + "epoch": 0.16151621781695294, + "grad_norm": 0.7069438472104946, + "learning_rate": 6.603079212135785e-07, + "loss": 1.6581, + "step": 2318 + }, + { + "epoch": 0.16158589694457026, + "grad_norm": 0.7138732731517777, + "learning_rate": 6.602744058125009e-07, + "loss": 1.5197, + "step": 2319 + }, + { + "epoch": 0.16165557607218758, + "grad_norm": 0.7168132849247152, + "learning_rate": 6.602408772197084e-07, + "loss": 1.4548, + "step": 2320 + }, + { + "epoch": 0.1617252551998049, + "grad_norm": 0.7326189178797672, + "learning_rate": 6.60207335436808e-07, + "loss": 1.6001, + "step": 2321 + }, + { + "epoch": 0.16179493432742223, + "grad_norm": 0.6927914280769315, + "learning_rate": 6.601737804654071e-07, + "loss": 1.6246, + "step": 2322 + }, + { + "epoch": 0.16186461345503955, + "grad_norm": 0.7548539239584744, + "learning_rate": 6.601402123071138e-07, + "loss": 1.7135, + "step": 2323 + }, + { + "epoch": 0.16193429258265687, + "grad_norm": 0.8231932734792815, + "learning_rate": 6.601066309635366e-07, + "loss": 1.6542, + "step": 2324 + }, + { + "epoch": 0.1620039717102742, + "grad_norm": 0.7076281356457971, + "learning_rate": 6.600730364362849e-07, + "loss": 1.6272, + "step": 2325 + }, + { + "epoch": 0.16207365083789152, + "grad_norm": 0.7152242206236852, + "learning_rate": 6.600394287269687e-07, + "loss": 1.4146, + "step": 2326 + }, + { + "epoch": 0.16214332996550884, + "grad_norm": 0.7079726966566244, + "learning_rate": 6.600058078371983e-07, + "loss": 1.4889, + "step": 2327 + }, + { + "epoch": 0.16221300909312616, + "grad_norm": 0.7254514558992783, + "learning_rate": 6.599721737685849e-07, + "loss": 1.502, + "step": 2328 + }, + { + "epoch": 0.1622826882207435, + "grad_norm": 0.6421208663306851, + "learning_rate": 6.599385265227405e-07, + "loss": 1.4715, + "step": 2329 + }, + { + "epoch": 0.1623523673483608, + "grad_norm": 0.7134578161355969, + "learning_rate": 6.599048661012776e-07, + "loss": 1.4987, + "step": 2330 + }, + { + "epoch": 0.16242204647597813, + "grad_norm": 0.6944470963507515, + "learning_rate": 6.598711925058091e-07, + "loss": 1.518, + "step": 2331 + }, + { + "epoch": 0.16249172560359545, + "grad_norm": 0.7024672276158266, + "learning_rate": 6.598375057379487e-07, + "loss": 1.6774, + "step": 2332 + }, + { + "epoch": 0.16256140473121278, + "grad_norm": 0.7529700252137836, + "learning_rate": 6.598038057993109e-07, + "loss": 1.5766, + "step": 2333 + }, + { + "epoch": 0.1626310838588301, + "grad_norm": 0.7632965108002485, + "learning_rate": 6.597700926915103e-07, + "loss": 1.6279, + "step": 2334 + }, + { + "epoch": 0.16270076298644742, + "grad_norm": 0.7201730099281007, + "learning_rate": 6.59736366416163e-07, + "loss": 1.5469, + "step": 2335 + }, + { + "epoch": 0.16277044211406475, + "grad_norm": 0.6996857758852515, + "learning_rate": 6.59702626974885e-07, + "loss": 1.5442, + "step": 2336 + }, + { + "epoch": 0.16284012124168207, + "grad_norm": 0.6917112721568993, + "learning_rate": 6.596688743692931e-07, + "loss": 1.563, + "step": 2337 + }, + { + "epoch": 0.1629098003692994, + "grad_norm": 0.666913569996407, + "learning_rate": 6.596351086010048e-07, + "loss": 1.638, + "step": 2338 + }, + { + "epoch": 0.16297947949691668, + "grad_norm": 0.6802046539952517, + "learning_rate": 6.596013296716384e-07, + "loss": 1.5565, + "step": 2339 + }, + { + "epoch": 0.163049158624534, + "grad_norm": 0.7063793504298344, + "learning_rate": 6.595675375828124e-07, + "loss": 1.5484, + "step": 2340 + }, + { + "epoch": 0.16311883775215133, + "grad_norm": 0.7332190383926808, + "learning_rate": 6.595337323361465e-07, + "loss": 1.5243, + "step": 2341 + }, + { + "epoch": 0.16318851687976865, + "grad_norm": 0.7329461018523288, + "learning_rate": 6.594999139332605e-07, + "loss": 1.7221, + "step": 2342 + }, + { + "epoch": 0.16325819600738598, + "grad_norm": 0.7419199688652642, + "learning_rate": 6.594660823757749e-07, + "loss": 1.5138, + "step": 2343 + }, + { + "epoch": 0.1633278751350033, + "grad_norm": 0.7639302342058242, + "learning_rate": 6.594322376653114e-07, + "loss": 1.6771, + "step": 2344 + }, + { + "epoch": 0.16339755426262062, + "grad_norm": 0.7472337265405039, + "learning_rate": 6.593983798034915e-07, + "loss": 1.518, + "step": 2345 + }, + { + "epoch": 0.16346723339023794, + "grad_norm": 0.7281142288983273, + "learning_rate": 6.59364508791938e-07, + "loss": 1.5627, + "step": 2346 + }, + { + "epoch": 0.16353691251785527, + "grad_norm": 0.7110147549449994, + "learning_rate": 6.593306246322739e-07, + "loss": 1.5052, + "step": 2347 + }, + { + "epoch": 0.1636065916454726, + "grad_norm": 0.7074364175708336, + "learning_rate": 6.592967273261232e-07, + "loss": 1.5836, + "step": 2348 + }, + { + "epoch": 0.1636762707730899, + "grad_norm": 0.7421192254211886, + "learning_rate": 6.592628168751102e-07, + "loss": 1.5003, + "step": 2349 + }, + { + "epoch": 0.16374594990070723, + "grad_norm": 0.7168279539068398, + "learning_rate": 6.592288932808598e-07, + "loss": 1.5503, + "step": 2350 + }, + { + "epoch": 0.16381562902832456, + "grad_norm": 0.7188929197436819, + "learning_rate": 6.591949565449979e-07, + "loss": 1.5518, + "step": 2351 + }, + { + "epoch": 0.16388530815594188, + "grad_norm": 0.762876427581947, + "learning_rate": 6.591610066691508e-07, + "loss": 1.5161, + "step": 2352 + }, + { + "epoch": 0.1639549872835592, + "grad_norm": 0.764966904869516, + "learning_rate": 6.591270436549456e-07, + "loss": 1.5519, + "step": 2353 + }, + { + "epoch": 0.16402466641117652, + "grad_norm": 0.7005514577699263, + "learning_rate": 6.590930675040095e-07, + "loss": 1.5517, + "step": 2354 + }, + { + "epoch": 0.16409434553879385, + "grad_norm": 0.6770300769048003, + "learning_rate": 6.59059078217971e-07, + "loss": 1.4233, + "step": 2355 + }, + { + "epoch": 0.16416402466641117, + "grad_norm": 0.7150779632036103, + "learning_rate": 6.590250757984588e-07, + "loss": 1.5995, + "step": 2356 + }, + { + "epoch": 0.1642337037940285, + "grad_norm": 0.6993402330838724, + "learning_rate": 6.589910602471023e-07, + "loss": 1.4289, + "step": 2357 + }, + { + "epoch": 0.1643033829216458, + "grad_norm": 0.7129939019203483, + "learning_rate": 6.589570315655318e-07, + "loss": 1.6153, + "step": 2358 + }, + { + "epoch": 0.16437306204926314, + "grad_norm": 0.7046647737874796, + "learning_rate": 6.589229897553779e-07, + "loss": 1.582, + "step": 2359 + }, + { + "epoch": 0.16444274117688046, + "grad_norm": 0.7290829227675538, + "learning_rate": 6.58888934818272e-07, + "loss": 1.4972, + "step": 2360 + }, + { + "epoch": 0.16451242030449778, + "grad_norm": 0.7072365195859416, + "learning_rate": 6.588548667558461e-07, + "loss": 1.4581, + "step": 2361 + }, + { + "epoch": 0.1645820994321151, + "grad_norm": 0.735679978353686, + "learning_rate": 6.588207855697326e-07, + "loss": 1.5999, + "step": 2362 + }, + { + "epoch": 0.16465177855973243, + "grad_norm": 0.7120299813416344, + "learning_rate": 6.587866912615651e-07, + "loss": 1.5704, + "step": 2363 + }, + { + "epoch": 0.16472145768734975, + "grad_norm": 0.6820387705988862, + "learning_rate": 6.587525838329772e-07, + "loss": 1.4342, + "step": 2364 + }, + { + "epoch": 0.16479113681496707, + "grad_norm": 0.7107393107714317, + "learning_rate": 6.587184632856035e-07, + "loss": 1.544, + "step": 2365 + }, + { + "epoch": 0.1648608159425844, + "grad_norm": 0.7535777040315426, + "learning_rate": 6.58684329621079e-07, + "loss": 1.6942, + "step": 2366 + }, + { + "epoch": 0.16493049507020172, + "grad_norm": 0.6771116719963444, + "learning_rate": 6.586501828410397e-07, + "loss": 1.5631, + "step": 2367 + }, + { + "epoch": 0.16500017419781904, + "grad_norm": 0.6617367548227207, + "learning_rate": 6.586160229471219e-07, + "loss": 1.4529, + "step": 2368 + }, + { + "epoch": 0.16506985332543636, + "grad_norm": 0.756471565795342, + "learning_rate": 6.585818499409624e-07, + "loss": 1.6085, + "step": 2369 + }, + { + "epoch": 0.16513953245305368, + "grad_norm": 0.7215067440748905, + "learning_rate": 6.585476638241991e-07, + "loss": 1.5597, + "step": 2370 + }, + { + "epoch": 0.165209211580671, + "grad_norm": 0.6715639533999088, + "learning_rate": 6.585134645984701e-07, + "loss": 1.5579, + "step": 2371 + }, + { + "epoch": 0.16527889070828833, + "grad_norm": 0.7528494030685173, + "learning_rate": 6.584792522654144e-07, + "loss": 1.613, + "step": 2372 + }, + { + "epoch": 0.16534856983590565, + "grad_norm": 0.9070409370704179, + "learning_rate": 6.584450268266715e-07, + "loss": 1.5739, + "step": 2373 + }, + { + "epoch": 0.16541824896352297, + "grad_norm": 0.7127053800060373, + "learning_rate": 6.584107882838815e-07, + "loss": 1.5985, + "step": 2374 + }, + { + "epoch": 0.1654879280911403, + "grad_norm": 0.7285186039275199, + "learning_rate": 6.583765366386853e-07, + "loss": 1.5165, + "step": 2375 + }, + { + "epoch": 0.16555760721875762, + "grad_norm": 0.7571340368115554, + "learning_rate": 6.583422718927242e-07, + "loss": 1.4696, + "step": 2376 + }, + { + "epoch": 0.16562728634637494, + "grad_norm": 0.6658658814069529, + "learning_rate": 6.583079940476402e-07, + "loss": 1.4661, + "step": 2377 + }, + { + "epoch": 0.16569696547399226, + "grad_norm": 0.8030332451545158, + "learning_rate": 6.582737031050761e-07, + "loss": 1.8509, + "step": 2378 + }, + { + "epoch": 0.1657666446016096, + "grad_norm": 0.6966252141405037, + "learning_rate": 6.582393990666751e-07, + "loss": 1.5828, + "step": 2379 + }, + { + "epoch": 0.1658363237292269, + "grad_norm": 0.7364229221914593, + "learning_rate": 6.582050819340812e-07, + "loss": 1.6218, + "step": 2380 + }, + { + "epoch": 0.16590600285684423, + "grad_norm": 0.7196295616760052, + "learning_rate": 6.581707517089389e-07, + "loss": 1.5562, + "step": 2381 + }, + { + "epoch": 0.16597568198446155, + "grad_norm": 0.7378354060426576, + "learning_rate": 6.581364083928932e-07, + "loss": 1.6569, + "step": 2382 + }, + { + "epoch": 0.16604536111207888, + "grad_norm": 0.7020021368288268, + "learning_rate": 6.581020519875903e-07, + "loss": 1.4947, + "step": 2383 + }, + { + "epoch": 0.1661150402396962, + "grad_norm": 23.011605268027264, + "learning_rate": 6.580676824946763e-07, + "loss": 1.4042, + "step": 2384 + }, + { + "epoch": 0.16618471936731352, + "grad_norm": 0.7040826666782872, + "learning_rate": 6.580332999157983e-07, + "loss": 1.469, + "step": 2385 + }, + { + "epoch": 0.16625439849493084, + "grad_norm": 0.7218140697486265, + "learning_rate": 6.579989042526041e-07, + "loss": 1.5228, + "step": 2386 + }, + { + "epoch": 0.16632407762254817, + "grad_norm": 0.7123392833850649, + "learning_rate": 6.579644955067421e-07, + "loss": 1.5075, + "step": 2387 + }, + { + "epoch": 0.1663937567501655, + "grad_norm": 0.742613976751209, + "learning_rate": 6.579300736798608e-07, + "loss": 1.6328, + "step": 2388 + }, + { + "epoch": 0.1664634358777828, + "grad_norm": 0.71508688345372, + "learning_rate": 6.578956387736101e-07, + "loss": 1.5266, + "step": 2389 + }, + { + "epoch": 0.16653311500540013, + "grad_norm": 0.7778468846082288, + "learning_rate": 6.578611907896403e-07, + "loss": 1.6486, + "step": 2390 + }, + { + "epoch": 0.16660279413301746, + "grad_norm": 0.6972662449499131, + "learning_rate": 6.57826729729602e-07, + "loss": 1.5245, + "step": 2391 + }, + { + "epoch": 0.16667247326063478, + "grad_norm": 0.6742424513816168, + "learning_rate": 6.577922555951467e-07, + "loss": 1.4659, + "step": 2392 + }, + { + "epoch": 0.1667421523882521, + "grad_norm": 0.7191566959015561, + "learning_rate": 6.577577683879266e-07, + "loss": 1.6706, + "step": 2393 + }, + { + "epoch": 0.16681183151586942, + "grad_norm": 0.742215586917538, + "learning_rate": 6.577232681095941e-07, + "loss": 1.6183, + "step": 2394 + }, + { + "epoch": 0.16688151064348675, + "grad_norm": 0.7459211762633696, + "learning_rate": 6.576887547618028e-07, + "loss": 1.6543, + "step": 2395 + }, + { + "epoch": 0.16695118977110407, + "grad_norm": 0.7406229483744687, + "learning_rate": 6.576542283462065e-07, + "loss": 1.6511, + "step": 2396 + }, + { + "epoch": 0.1670208688987214, + "grad_norm": 0.7041449161282237, + "learning_rate": 6.576196888644599e-07, + "loss": 1.4184, + "step": 2397 + }, + { + "epoch": 0.16709054802633871, + "grad_norm": 0.705938722848296, + "learning_rate": 6.57585136318218e-07, + "loss": 1.5844, + "step": 2398 + }, + { + "epoch": 0.16716022715395604, + "grad_norm": 0.7396198963679226, + "learning_rate": 6.575505707091368e-07, + "loss": 1.5891, + "step": 2399 + }, + { + "epoch": 0.16722990628157336, + "grad_norm": 0.7350092724605711, + "learning_rate": 6.575159920388728e-07, + "loss": 1.6338, + "step": 2400 + }, + { + "epoch": 0.16729958540919068, + "grad_norm": 0.7163445839743655, + "learning_rate": 6.57481400309083e-07, + "loss": 1.5543, + "step": 2401 + }, + { + "epoch": 0.167369264536808, + "grad_norm": 0.7149076387617239, + "learning_rate": 6.574467955214251e-07, + "loss": 1.5448, + "step": 2402 + }, + { + "epoch": 0.16743894366442533, + "grad_norm": 0.7125466696460487, + "learning_rate": 6.574121776775573e-07, + "loss": 1.3838, + "step": 2403 + }, + { + "epoch": 0.16750862279204265, + "grad_norm": 0.7253541486554131, + "learning_rate": 6.573775467791388e-07, + "loss": 1.6541, + "step": 2404 + }, + { + "epoch": 0.16757830191965997, + "grad_norm": 0.6888729373269321, + "learning_rate": 6.57342902827829e-07, + "loss": 1.4966, + "step": 2405 + }, + { + "epoch": 0.1676479810472773, + "grad_norm": 0.7242788546055735, + "learning_rate": 6.573082458252883e-07, + "loss": 1.4608, + "step": 2406 + }, + { + "epoch": 0.16771766017489462, + "grad_norm": 0.719095591694848, + "learning_rate": 6.572735757731774e-07, + "loss": 1.5231, + "step": 2407 + }, + { + "epoch": 0.16778733930251194, + "grad_norm": 0.7062431253120159, + "learning_rate": 6.572388926731578e-07, + "loss": 1.5036, + "step": 2408 + }, + { + "epoch": 0.16785701843012926, + "grad_norm": 0.7380879998828161, + "learning_rate": 6.572041965268916e-07, + "loss": 1.6707, + "step": 2409 + }, + { + "epoch": 0.16792669755774658, + "grad_norm": 0.6863769124791635, + "learning_rate": 6.571694873360414e-07, + "loss": 1.5437, + "step": 2410 + }, + { + "epoch": 0.1679963766853639, + "grad_norm": 0.708848587877779, + "learning_rate": 6.571347651022706e-07, + "loss": 1.5714, + "step": 2411 + }, + { + "epoch": 0.16806605581298123, + "grad_norm": 0.7038223757104983, + "learning_rate": 6.571000298272432e-07, + "loss": 1.5171, + "step": 2412 + }, + { + "epoch": 0.16813573494059855, + "grad_norm": 0.760760658766724, + "learning_rate": 6.570652815126238e-07, + "loss": 1.5314, + "step": 2413 + }, + { + "epoch": 0.16820541406821587, + "grad_norm": 0.7801390262080484, + "learning_rate": 6.570305201600774e-07, + "loss": 1.6455, + "step": 2414 + }, + { + "epoch": 0.1682750931958332, + "grad_norm": 0.6874636309530708, + "learning_rate": 6.569957457712702e-07, + "loss": 1.4671, + "step": 2415 + }, + { + "epoch": 0.16834477232345052, + "grad_norm": 0.7394775403102596, + "learning_rate": 6.569609583478684e-07, + "loss": 1.5806, + "step": 2416 + }, + { + "epoch": 0.16841445145106784, + "grad_norm": 0.7229398082360742, + "learning_rate": 6.56926157891539e-07, + "loss": 1.6871, + "step": 2417 + }, + { + "epoch": 0.16848413057868517, + "grad_norm": 0.6976349597214885, + "learning_rate": 6.568913444039502e-07, + "loss": 1.4284, + "step": 2418 + }, + { + "epoch": 0.1685538097063025, + "grad_norm": 0.7426082913327527, + "learning_rate": 6.568565178867696e-07, + "loss": 1.6139, + "step": 2419 + }, + { + "epoch": 0.1686234888339198, + "grad_norm": 0.7408162730120007, + "learning_rate": 6.568216783416669e-07, + "loss": 1.4382, + "step": 2420 + }, + { + "epoch": 0.16869316796153713, + "grad_norm": 0.7843645939651729, + "learning_rate": 6.567868257703112e-07, + "loss": 1.5234, + "step": 2421 + }, + { + "epoch": 0.16876284708915446, + "grad_norm": 0.7106920389232463, + "learning_rate": 6.567519601743728e-07, + "loss": 1.5792, + "step": 2422 + }, + { + "epoch": 0.16883252621677178, + "grad_norm": 0.7380262499470066, + "learning_rate": 6.567170815555226e-07, + "loss": 1.5791, + "step": 2423 + }, + { + "epoch": 0.1689022053443891, + "grad_norm": 0.7456240829771935, + "learning_rate": 6.56682189915432e-07, + "loss": 1.5671, + "step": 2424 + }, + { + "epoch": 0.16897188447200642, + "grad_norm": 0.7274486962987895, + "learning_rate": 6.566472852557731e-07, + "loss": 1.5414, + "step": 2425 + }, + { + "epoch": 0.16904156359962375, + "grad_norm": 0.7512202796475355, + "learning_rate": 6.566123675782187e-07, + "loss": 1.5159, + "step": 2426 + }, + { + "epoch": 0.16911124272724107, + "grad_norm": 0.709986504523135, + "learning_rate": 6.56577436884442e-07, + "loss": 1.5121, + "step": 2427 + }, + { + "epoch": 0.1691809218548584, + "grad_norm": 0.714474528207041, + "learning_rate": 6.56542493176117e-07, + "loss": 1.4527, + "step": 2428 + }, + { + "epoch": 0.1692506009824757, + "grad_norm": 0.6975956737888491, + "learning_rate": 6.565075364549182e-07, + "loss": 1.5378, + "step": 2429 + }, + { + "epoch": 0.16932028011009304, + "grad_norm": 0.7889336946448815, + "learning_rate": 6.564725667225209e-07, + "loss": 1.5498, + "step": 2430 + }, + { + "epoch": 0.16938995923771033, + "grad_norm": 0.7181576447687493, + "learning_rate": 6.564375839806009e-07, + "loss": 1.6174, + "step": 2431 + }, + { + "epoch": 0.16945963836532765, + "grad_norm": 0.7157683881150451, + "learning_rate": 6.564025882308345e-07, + "loss": 1.5108, + "step": 2432 + }, + { + "epoch": 0.16952931749294498, + "grad_norm": 0.719495796453952, + "learning_rate": 6.56367579474899e-07, + "loss": 1.5718, + "step": 2433 + }, + { + "epoch": 0.1695989966205623, + "grad_norm": 0.7061695225090542, + "learning_rate": 6.56332557714472e-07, + "loss": 1.4915, + "step": 2434 + }, + { + "epoch": 0.16966867574817962, + "grad_norm": 0.7577425760099268, + "learning_rate": 6.562975229512317e-07, + "loss": 1.5643, + "step": 2435 + }, + { + "epoch": 0.16973835487579694, + "grad_norm": 0.6952801720982502, + "learning_rate": 6.562624751868573e-07, + "loss": 1.49, + "step": 2436 + }, + { + "epoch": 0.16980803400341427, + "grad_norm": 0.7197087596785499, + "learning_rate": 6.562274144230281e-07, + "loss": 1.6057, + "step": 2437 + }, + { + "epoch": 0.1698777131310316, + "grad_norm": 0.6958464094196392, + "learning_rate": 6.561923406614243e-07, + "loss": 1.5574, + "step": 2438 + }, + { + "epoch": 0.1699473922586489, + "grad_norm": 0.7366661054000561, + "learning_rate": 6.56157253903727e-07, + "loss": 1.5172, + "step": 2439 + }, + { + "epoch": 0.17001707138626623, + "grad_norm": 0.6770931017295103, + "learning_rate": 6.561221541516173e-07, + "loss": 1.5862, + "step": 2440 + }, + { + "epoch": 0.17008675051388356, + "grad_norm": 0.7110420060229062, + "learning_rate": 6.560870414067773e-07, + "loss": 1.597, + "step": 2441 + }, + { + "epoch": 0.17015642964150088, + "grad_norm": 0.7620049791494978, + "learning_rate": 6.560519156708898e-07, + "loss": 1.6246, + "step": 2442 + }, + { + "epoch": 0.1702261087691182, + "grad_norm": 0.7140259867968641, + "learning_rate": 6.560167769456381e-07, + "loss": 1.5858, + "step": 2443 + }, + { + "epoch": 0.17029578789673552, + "grad_norm": 0.7617281906746146, + "learning_rate": 6.559816252327059e-07, + "loss": 1.6849, + "step": 2444 + }, + { + "epoch": 0.17036546702435285, + "grad_norm": 0.7099678487031629, + "learning_rate": 6.55946460533778e-07, + "loss": 1.5603, + "step": 2445 + }, + { + "epoch": 0.17043514615197017, + "grad_norm": 0.8066709045974286, + "learning_rate": 6.559112828505397e-07, + "loss": 1.6058, + "step": 2446 + }, + { + "epoch": 0.1705048252795875, + "grad_norm": 0.7135349848853826, + "learning_rate": 6.558760921846762e-07, + "loss": 1.6007, + "step": 2447 + }, + { + "epoch": 0.1705745044072048, + "grad_norm": 0.7441499343871727, + "learning_rate": 6.558408885378744e-07, + "loss": 1.5822, + "step": 2448 + }, + { + "epoch": 0.17064418353482214, + "grad_norm": 0.7321585891025582, + "learning_rate": 6.558056719118212e-07, + "loss": 1.6208, + "step": 2449 + }, + { + "epoch": 0.17071386266243946, + "grad_norm": 0.7988542772381881, + "learning_rate": 6.557704423082042e-07, + "loss": 1.6834, + "step": 2450 + }, + { + "epoch": 0.17078354179005678, + "grad_norm": 0.7160620851253273, + "learning_rate": 6.557351997287118e-07, + "loss": 1.5184, + "step": 2451 + }, + { + "epoch": 0.1708532209176741, + "grad_norm": 0.6947991412978013, + "learning_rate": 6.556999441750328e-07, + "loss": 1.547, + "step": 2452 + }, + { + "epoch": 0.17092290004529143, + "grad_norm": 0.7031959563327964, + "learning_rate": 6.556646756488567e-07, + "loss": 1.4695, + "step": 2453 + }, + { + "epoch": 0.17099257917290875, + "grad_norm": 0.7041369358140038, + "learning_rate": 6.556293941518737e-07, + "loss": 1.5845, + "step": 2454 + }, + { + "epoch": 0.17106225830052607, + "grad_norm": 0.7228048984189448, + "learning_rate": 6.555940996857746e-07, + "loss": 1.5521, + "step": 2455 + }, + { + "epoch": 0.1711319374281434, + "grad_norm": 0.763823425561266, + "learning_rate": 6.555587922522507e-07, + "loss": 1.514, + "step": 2456 + }, + { + "epoch": 0.17120161655576072, + "grad_norm": 0.7060298331702053, + "learning_rate": 6.55523471852994e-07, + "loss": 1.6084, + "step": 2457 + }, + { + "epoch": 0.17127129568337804, + "grad_norm": 0.7791863111355747, + "learning_rate": 6.554881384896971e-07, + "loss": 1.6451, + "step": 2458 + }, + { + "epoch": 0.17134097481099536, + "grad_norm": 0.7548442713753598, + "learning_rate": 6.554527921640534e-07, + "loss": 1.5738, + "step": 2459 + }, + { + "epoch": 0.17141065393861268, + "grad_norm": 0.7318943333159449, + "learning_rate": 6.554174328777566e-07, + "loss": 1.5405, + "step": 2460 + }, + { + "epoch": 0.17148033306623, + "grad_norm": 0.7107952467450837, + "learning_rate": 6.553820606325013e-07, + "loss": 1.5816, + "step": 2461 + }, + { + "epoch": 0.17155001219384733, + "grad_norm": 0.7148920719538931, + "learning_rate": 6.553466754299825e-07, + "loss": 1.4621, + "step": 2462 + }, + { + "epoch": 0.17161969132146465, + "grad_norm": 15.284258988235981, + "learning_rate": 6.553112772718961e-07, + "loss": 1.4982, + "step": 2463 + }, + { + "epoch": 0.17168937044908197, + "grad_norm": 0.7606998847724221, + "learning_rate": 6.552758661599384e-07, + "loss": 1.505, + "step": 2464 + }, + { + "epoch": 0.1717590495766993, + "grad_norm": 0.737717860072526, + "learning_rate": 6.552404420958061e-07, + "loss": 1.6274, + "step": 2465 + }, + { + "epoch": 0.17182872870431662, + "grad_norm": 0.6936147594438409, + "learning_rate": 6.552050050811973e-07, + "loss": 1.4483, + "step": 2466 + }, + { + "epoch": 0.17189840783193394, + "grad_norm": 0.6821739277267969, + "learning_rate": 6.551695551178097e-07, + "loss": 1.5747, + "step": 2467 + }, + { + "epoch": 0.17196808695955126, + "grad_norm": 0.7209446081125167, + "learning_rate": 6.551340922073425e-07, + "loss": 1.6935, + "step": 2468 + }, + { + "epoch": 0.1720377660871686, + "grad_norm": 0.706917031810206, + "learning_rate": 6.55098616351495e-07, + "loss": 1.5429, + "step": 2469 + }, + { + "epoch": 0.1721074452147859, + "grad_norm": 0.6657369029274698, + "learning_rate": 6.550631275519674e-07, + "loss": 1.6344, + "step": 2470 + }, + { + "epoch": 0.17217712434240323, + "grad_norm": 0.7300297982250075, + "learning_rate": 6.550276258104601e-07, + "loss": 1.7434, + "step": 2471 + }, + { + "epoch": 0.17224680347002055, + "grad_norm": 0.7218093987525189, + "learning_rate": 6.549921111286748e-07, + "loss": 1.5561, + "step": 2472 + }, + { + "epoch": 0.17231648259763788, + "grad_norm": 0.7178449444982237, + "learning_rate": 6.549565835083131e-07, + "loss": 1.4582, + "step": 2473 + }, + { + "epoch": 0.1723861617252552, + "grad_norm": 0.7341473296783434, + "learning_rate": 6.549210429510778e-07, + "loss": 1.5608, + "step": 2474 + }, + { + "epoch": 0.17245584085287252, + "grad_norm": 0.7524545461753286, + "learning_rate": 6.548854894586719e-07, + "loss": 1.5571, + "step": 2475 + }, + { + "epoch": 0.17252551998048984, + "grad_norm": 0.7265542621338041, + "learning_rate": 6.548499230327993e-07, + "loss": 1.524, + "step": 2476 + }, + { + "epoch": 0.17259519910810717, + "grad_norm": 0.7105786157504762, + "learning_rate": 6.548143436751646e-07, + "loss": 1.4915, + "step": 2477 + }, + { + "epoch": 0.1726648782357245, + "grad_norm": 0.7174301051303112, + "learning_rate": 6.547787513874723e-07, + "loss": 1.4659, + "step": 2478 + }, + { + "epoch": 0.1727345573633418, + "grad_norm": 0.7503192380533867, + "learning_rate": 6.547431461714286e-07, + "loss": 1.5658, + "step": 2479 + }, + { + "epoch": 0.17280423649095913, + "grad_norm": 0.6907154911256304, + "learning_rate": 6.547075280287396e-07, + "loss": 1.5539, + "step": 2480 + }, + { + "epoch": 0.17287391561857646, + "grad_norm": 0.7794515823710214, + "learning_rate": 6.546718969611121e-07, + "loss": 1.7881, + "step": 2481 + }, + { + "epoch": 0.17294359474619378, + "grad_norm": 0.6603868909619415, + "learning_rate": 6.546362529702536e-07, + "loss": 1.4557, + "step": 2482 + }, + { + "epoch": 0.1730132738738111, + "grad_norm": 0.7586159790450311, + "learning_rate": 6.546005960578724e-07, + "loss": 1.5587, + "step": 2483 + }, + { + "epoch": 0.17308295300142842, + "grad_norm": 0.7404883917977331, + "learning_rate": 6.545649262256771e-07, + "loss": 1.483, + "step": 2484 + }, + { + "epoch": 0.17315263212904575, + "grad_norm": 0.7526067704071594, + "learning_rate": 6.545292434753772e-07, + "loss": 1.4999, + "step": 2485 + }, + { + "epoch": 0.17322231125666307, + "grad_norm": 0.799844700462213, + "learning_rate": 6.544935478086825e-07, + "loss": 1.6473, + "step": 2486 + }, + { + "epoch": 0.1732919903842804, + "grad_norm": 0.7884391841826116, + "learning_rate": 6.544578392273038e-07, + "loss": 1.5843, + "step": 2487 + }, + { + "epoch": 0.17336166951189771, + "grad_norm": 0.722546423895997, + "learning_rate": 6.544221177329522e-07, + "loss": 1.5647, + "step": 2488 + }, + { + "epoch": 0.17343134863951504, + "grad_norm": 0.6995711761145036, + "learning_rate": 6.543863833273397e-07, + "loss": 1.4946, + "step": 2489 + }, + { + "epoch": 0.17350102776713236, + "grad_norm": 0.7608256105047294, + "learning_rate": 6.543506360121787e-07, + "loss": 1.5278, + "step": 2490 + }, + { + "epoch": 0.17357070689474968, + "grad_norm": 0.7060114399131842, + "learning_rate": 6.543148757891821e-07, + "loss": 1.4342, + "step": 2491 + }, + { + "epoch": 0.173640386022367, + "grad_norm": 0.7038526017797239, + "learning_rate": 6.542791026600639e-07, + "loss": 1.5103, + "step": 2492 + }, + { + "epoch": 0.17371006514998433, + "grad_norm": 0.7454708545350585, + "learning_rate": 6.542433166265382e-07, + "loss": 1.5699, + "step": 2493 + }, + { + "epoch": 0.17377974427760165, + "grad_norm": 0.7241434175996339, + "learning_rate": 6.5420751769032e-07, + "loss": 1.5849, + "step": 2494 + }, + { + "epoch": 0.17384942340521897, + "grad_norm": 0.6828214180093903, + "learning_rate": 6.54171705853125e-07, + "loss": 1.5575, + "step": 2495 + }, + { + "epoch": 0.1739191025328363, + "grad_norm": 0.7645534413737075, + "learning_rate": 6.541358811166691e-07, + "loss": 1.7047, + "step": 2496 + }, + { + "epoch": 0.17398878166045362, + "grad_norm": 0.7599757807395299, + "learning_rate": 6.541000434826694e-07, + "loss": 1.5909, + "step": 2497 + }, + { + "epoch": 0.17405846078807094, + "grad_norm": 0.8077318342213113, + "learning_rate": 6.540641929528431e-07, + "loss": 1.6113, + "step": 2498 + }, + { + "epoch": 0.17412813991568826, + "grad_norm": 0.7421508001687054, + "learning_rate": 6.540283295289082e-07, + "loss": 1.5622, + "step": 2499 + }, + { + "epoch": 0.17419781904330559, + "grad_norm": 0.7088253814000094, + "learning_rate": 6.539924532125835e-07, + "loss": 1.6426, + "step": 2500 + }, + { + "epoch": 0.1742674981709229, + "grad_norm": 0.7474674019877895, + "learning_rate": 6.539565640055884e-07, + "loss": 1.5854, + "step": 2501 + }, + { + "epoch": 0.17433717729854023, + "grad_norm": 0.7365714423846182, + "learning_rate": 6.539206619096424e-07, + "loss": 1.6054, + "step": 2502 + }, + { + "epoch": 0.17440685642615755, + "grad_norm": 0.7123103017402581, + "learning_rate": 6.538847469264663e-07, + "loss": 1.6297, + "step": 2503 + }, + { + "epoch": 0.17447653555377488, + "grad_norm": 0.7224845735928211, + "learning_rate": 6.53848819057781e-07, + "loss": 1.6559, + "step": 2504 + }, + { + "epoch": 0.1745462146813922, + "grad_norm": 0.6926473364257885, + "learning_rate": 6.538128783053084e-07, + "loss": 1.6328, + "step": 2505 + }, + { + "epoch": 0.17461589380900952, + "grad_norm": 0.7662335923901936, + "learning_rate": 6.537769246707708e-07, + "loss": 1.405, + "step": 2506 + }, + { + "epoch": 0.17468557293662684, + "grad_norm": 0.6971301204534728, + "learning_rate": 6.537409581558911e-07, + "loss": 1.4508, + "step": 2507 + }, + { + "epoch": 0.17475525206424417, + "grad_norm": 0.7439320230916382, + "learning_rate": 6.537049787623931e-07, + "loss": 1.5743, + "step": 2508 + }, + { + "epoch": 0.1748249311918615, + "grad_norm": 0.7336682024026748, + "learning_rate": 6.536689864920009e-07, + "loss": 1.6968, + "step": 2509 + }, + { + "epoch": 0.1748946103194788, + "grad_norm": 0.7449360351785614, + "learning_rate": 6.536329813464393e-07, + "loss": 1.6851, + "step": 2510 + }, + { + "epoch": 0.17496428944709613, + "grad_norm": 0.7593962246800922, + "learning_rate": 6.535969633274337e-07, + "loss": 1.5848, + "step": 2511 + }, + { + "epoch": 0.17503396857471346, + "grad_norm": 0.6892811934428639, + "learning_rate": 6.535609324367102e-07, + "loss": 1.5004, + "step": 2512 + }, + { + "epoch": 0.17510364770233078, + "grad_norm": 0.7357891038175344, + "learning_rate": 6.535248886759954e-07, + "loss": 1.5088, + "step": 2513 + }, + { + "epoch": 0.1751733268299481, + "grad_norm": 0.7200528123000481, + "learning_rate": 6.534888320470168e-07, + "loss": 1.5738, + "step": 2514 + }, + { + "epoch": 0.17524300595756542, + "grad_norm": 0.6873780049343139, + "learning_rate": 6.53452762551502e-07, + "loss": 1.4716, + "step": 2515 + }, + { + "epoch": 0.17531268508518275, + "grad_norm": 0.724561950813103, + "learning_rate": 6.534166801911799e-07, + "loss": 1.434, + "step": 2516 + }, + { + "epoch": 0.17538236421280007, + "grad_norm": 0.7489153459676924, + "learning_rate": 6.533805849677792e-07, + "loss": 1.4961, + "step": 2517 + }, + { + "epoch": 0.1754520433404174, + "grad_norm": 0.7401673701928653, + "learning_rate": 6.533444768830302e-07, + "loss": 1.5407, + "step": 2518 + }, + { + "epoch": 0.1755217224680347, + "grad_norm": 0.6934330185732609, + "learning_rate": 6.533083559386627e-07, + "loss": 1.4262, + "step": 2519 + }, + { + "epoch": 0.17559140159565204, + "grad_norm": 0.7294097563407264, + "learning_rate": 6.532722221364081e-07, + "loss": 1.6563, + "step": 2520 + }, + { + "epoch": 0.17566108072326936, + "grad_norm": 0.6607520492198753, + "learning_rate": 6.532360754779977e-07, + "loss": 1.4277, + "step": 2521 + }, + { + "epoch": 0.17573075985088665, + "grad_norm": 0.734449690451649, + "learning_rate": 6.53199915965164e-07, + "loss": 1.5041, + "step": 2522 + }, + { + "epoch": 0.17580043897850398, + "grad_norm": 0.7278895608033406, + "learning_rate": 6.531637435996397e-07, + "loss": 1.5348, + "step": 2523 + }, + { + "epoch": 0.1758701181061213, + "grad_norm": 0.6880733498662125, + "learning_rate": 6.531275583831584e-07, + "loss": 1.5598, + "step": 2524 + }, + { + "epoch": 0.17593979723373862, + "grad_norm": 0.7219542430932584, + "learning_rate": 6.530913603174539e-07, + "loss": 1.6169, + "step": 2525 + }, + { + "epoch": 0.17600947636135594, + "grad_norm": 0.6768504677014932, + "learning_rate": 6.530551494042611e-07, + "loss": 1.3468, + "step": 2526 + }, + { + "epoch": 0.17607915548897327, + "grad_norm": 0.7628140093864075, + "learning_rate": 6.530189256453151e-07, + "loss": 1.4331, + "step": 2527 + }, + { + "epoch": 0.1761488346165906, + "grad_norm": 0.6937139477500073, + "learning_rate": 6.529826890423521e-07, + "loss": 1.5771, + "step": 2528 + }, + { + "epoch": 0.1762185137442079, + "grad_norm": 0.7446668334736429, + "learning_rate": 6.529464395971085e-07, + "loss": 1.5671, + "step": 2529 + }, + { + "epoch": 0.17628819287182523, + "grad_norm": 0.8951513009342587, + "learning_rate": 6.529101773113213e-07, + "loss": 1.6593, + "step": 2530 + }, + { + "epoch": 0.17635787199944256, + "grad_norm": 0.7501370933617739, + "learning_rate": 6.528739021867285e-07, + "loss": 1.5608, + "step": 2531 + }, + { + "epoch": 0.17642755112705988, + "grad_norm": 0.7259327370195917, + "learning_rate": 6.528376142250684e-07, + "loss": 1.6285, + "step": 2532 + }, + { + "epoch": 0.1764972302546772, + "grad_norm": 0.6513213101252865, + "learning_rate": 6.528013134280799e-07, + "loss": 1.4204, + "step": 2533 + }, + { + "epoch": 0.17656690938229452, + "grad_norm": 0.6998756742851133, + "learning_rate": 6.527649997975026e-07, + "loss": 1.495, + "step": 2534 + }, + { + "epoch": 0.17663658850991185, + "grad_norm": 0.7206725602051455, + "learning_rate": 6.52728673335077e-07, + "loss": 1.6582, + "step": 2535 + }, + { + "epoch": 0.17670626763752917, + "grad_norm": 0.725602973875328, + "learning_rate": 6.526923340425437e-07, + "loss": 1.5383, + "step": 2536 + }, + { + "epoch": 0.1767759467651465, + "grad_norm": 0.7314479526711624, + "learning_rate": 6.52655981921644e-07, + "loss": 1.378, + "step": 2537 + }, + { + "epoch": 0.1768456258927638, + "grad_norm": 0.7116441177034473, + "learning_rate": 6.526196169741202e-07, + "loss": 1.4627, + "step": 2538 + }, + { + "epoch": 0.17691530502038114, + "grad_norm": 0.7290784007685396, + "learning_rate": 6.525832392017151e-07, + "loss": 1.5596, + "step": 2539 + }, + { + "epoch": 0.17698498414799846, + "grad_norm": 0.6934353435568296, + "learning_rate": 6.525468486061717e-07, + "loss": 1.4273, + "step": 2540 + }, + { + "epoch": 0.17705466327561578, + "grad_norm": 0.7232881714758217, + "learning_rate": 6.525104451892341e-07, + "loss": 1.5556, + "step": 2541 + }, + { + "epoch": 0.1771243424032331, + "grad_norm": 0.7330226056875261, + "learning_rate": 6.524740289526469e-07, + "loss": 1.5676, + "step": 2542 + }, + { + "epoch": 0.17719402153085043, + "grad_norm": 0.6942435985096442, + "learning_rate": 6.524375998981549e-07, + "loss": 1.4992, + "step": 2543 + }, + { + "epoch": 0.17726370065846775, + "grad_norm": 0.7087192076326799, + "learning_rate": 6.524011580275041e-07, + "loss": 1.5234, + "step": 2544 + }, + { + "epoch": 0.17733337978608507, + "grad_norm": 0.6906658016292048, + "learning_rate": 6.52364703342441e-07, + "loss": 1.5446, + "step": 2545 + }, + { + "epoch": 0.1774030589137024, + "grad_norm": 0.6590162253016592, + "learning_rate": 6.523282358447123e-07, + "loss": 1.4942, + "step": 2546 + }, + { + "epoch": 0.17747273804131972, + "grad_norm": 0.7374886336643569, + "learning_rate": 6.522917555360658e-07, + "loss": 1.6685, + "step": 2547 + }, + { + "epoch": 0.17754241716893704, + "grad_norm": 0.7288316061604521, + "learning_rate": 6.522552624182495e-07, + "loss": 1.5612, + "step": 2548 + }, + { + "epoch": 0.17761209629655436, + "grad_norm": 0.7986604299216298, + "learning_rate": 6.522187564930125e-07, + "loss": 1.5161, + "step": 2549 + }, + { + "epoch": 0.17768177542417168, + "grad_norm": 0.7490611352894104, + "learning_rate": 6.521822377621042e-07, + "loss": 1.571, + "step": 2550 + }, + { + "epoch": 0.177751454551789, + "grad_norm": 0.7096302346445801, + "learning_rate": 6.521457062272743e-07, + "loss": 1.4839, + "step": 2551 + }, + { + "epoch": 0.17782113367940633, + "grad_norm": 0.7300140825304802, + "learning_rate": 6.521091618902738e-07, + "loss": 1.615, + "step": 2552 + }, + { + "epoch": 0.17789081280702365, + "grad_norm": 0.6966168480177054, + "learning_rate": 6.520726047528539e-07, + "loss": 1.5791, + "step": 2553 + }, + { + "epoch": 0.17796049193464097, + "grad_norm": 0.7468143622329642, + "learning_rate": 6.520360348167666e-07, + "loss": 1.5228, + "step": 2554 + }, + { + "epoch": 0.1780301710622583, + "grad_norm": 0.6872146413398952, + "learning_rate": 6.519994520837641e-07, + "loss": 1.4354, + "step": 2555 + }, + { + "epoch": 0.17809985018987562, + "grad_norm": 0.7369132879346765, + "learning_rate": 6.519628565555998e-07, + "loss": 1.564, + "step": 2556 + }, + { + "epoch": 0.17816952931749294, + "grad_norm": 0.7484987827082581, + "learning_rate": 6.519262482340275e-07, + "loss": 1.4987, + "step": 2557 + }, + { + "epoch": 0.17823920844511026, + "grad_norm": 0.7590198399272611, + "learning_rate": 6.518896271208012e-07, + "loss": 1.7448, + "step": 2558 + }, + { + "epoch": 0.1783088875727276, + "grad_norm": 0.6911442734510137, + "learning_rate": 6.518529932176761e-07, + "loss": 1.6123, + "step": 2559 + }, + { + "epoch": 0.1783785667003449, + "grad_norm": 0.6705004696348974, + "learning_rate": 6.518163465264078e-07, + "loss": 1.5614, + "step": 2560 + }, + { + "epoch": 0.17844824582796223, + "grad_norm": 0.750693147635284, + "learning_rate": 6.517796870487524e-07, + "loss": 1.5328, + "step": 2561 + }, + { + "epoch": 0.17851792495557955, + "grad_norm": 0.6856127610822711, + "learning_rate": 6.517430147864667e-07, + "loss": 1.48, + "step": 2562 + }, + { + "epoch": 0.17858760408319688, + "grad_norm": 0.7275820774071743, + "learning_rate": 6.51706329741308e-07, + "loss": 1.5447, + "step": 2563 + }, + { + "epoch": 0.1786572832108142, + "grad_norm": 0.6711195393492667, + "learning_rate": 6.516696319150345e-07, + "loss": 1.5349, + "step": 2564 + }, + { + "epoch": 0.17872696233843152, + "grad_norm": 0.7017299635771622, + "learning_rate": 6.51632921309405e-07, + "loss": 1.4836, + "step": 2565 + }, + { + "epoch": 0.17879664146604884, + "grad_norm": 0.6998012714366909, + "learning_rate": 6.515961979261782e-07, + "loss": 1.4723, + "step": 2566 + }, + { + "epoch": 0.17886632059366617, + "grad_norm": 0.7104918945327922, + "learning_rate": 6.515594617671145e-07, + "loss": 1.582, + "step": 2567 + }, + { + "epoch": 0.1789359997212835, + "grad_norm": 0.6731653151031656, + "learning_rate": 6.515227128339741e-07, + "loss": 1.5919, + "step": 2568 + }, + { + "epoch": 0.1790056788489008, + "grad_norm": 0.703887895548082, + "learning_rate": 6.51485951128518e-07, + "loss": 1.5629, + "step": 2569 + }, + { + "epoch": 0.17907535797651813, + "grad_norm": 0.7025048472660292, + "learning_rate": 6.514491766525082e-07, + "loss": 1.5729, + "step": 2570 + }, + { + "epoch": 0.17914503710413546, + "grad_norm": 0.6551133693668119, + "learning_rate": 6.514123894077068e-07, + "loss": 1.5205, + "step": 2571 + }, + { + "epoch": 0.17921471623175278, + "grad_norm": 0.7821354033326472, + "learning_rate": 6.513755893958768e-07, + "loss": 1.5875, + "step": 2572 + }, + { + "epoch": 0.1792843953593701, + "grad_norm": 0.7426558766874091, + "learning_rate": 6.513387766187816e-07, + "loss": 1.5528, + "step": 2573 + }, + { + "epoch": 0.17935407448698742, + "grad_norm": 0.7372215085776161, + "learning_rate": 6.513019510781856e-07, + "loss": 1.6203, + "step": 2574 + }, + { + "epoch": 0.17942375361460475, + "grad_norm": 0.7242908564459493, + "learning_rate": 6.512651127758533e-07, + "loss": 1.564, + "step": 2575 + }, + { + "epoch": 0.17949343274222207, + "grad_norm": 0.7324473423283194, + "learning_rate": 6.512282617135501e-07, + "loss": 1.5448, + "step": 2576 + }, + { + "epoch": 0.1795631118698394, + "grad_norm": 0.6708021000359229, + "learning_rate": 6.511913978930421e-07, + "loss": 1.4548, + "step": 2577 + }, + { + "epoch": 0.17963279099745671, + "grad_norm": 0.7622906166024526, + "learning_rate": 6.511545213160959e-07, + "loss": 1.72, + "step": 2578 + }, + { + "epoch": 0.17970247012507404, + "grad_norm": 0.8097203764903231, + "learning_rate": 6.511176319844786e-07, + "loss": 1.5922, + "step": 2579 + }, + { + "epoch": 0.17977214925269136, + "grad_norm": 0.7167045481979863, + "learning_rate": 6.51080729899958e-07, + "loss": 1.5534, + "step": 2580 + }, + { + "epoch": 0.17984182838030868, + "grad_norm": 0.7130025941415435, + "learning_rate": 6.510438150643025e-07, + "loss": 1.4931, + "step": 2581 + }, + { + "epoch": 0.179911507507926, + "grad_norm": 0.7206872658927816, + "learning_rate": 6.510068874792813e-07, + "loss": 1.549, + "step": 2582 + }, + { + "epoch": 0.17998118663554333, + "grad_norm": 0.6855445976689892, + "learning_rate": 6.50969947146664e-07, + "loss": 1.5097, + "step": 2583 + }, + { + "epoch": 0.18005086576316065, + "grad_norm": 0.8358535909689165, + "learning_rate": 6.509329940682207e-07, + "loss": 1.6702, + "step": 2584 + }, + { + "epoch": 0.18012054489077797, + "grad_norm": 0.7251461113964899, + "learning_rate": 6.508960282457224e-07, + "loss": 1.5584, + "step": 2585 + }, + { + "epoch": 0.1801902240183953, + "grad_norm": 0.731726399487568, + "learning_rate": 6.508590496809407e-07, + "loss": 1.6408, + "step": 2586 + }, + { + "epoch": 0.18025990314601262, + "grad_norm": 0.771355055088224, + "learning_rate": 6.508220583756474e-07, + "loss": 1.6582, + "step": 2587 + }, + { + "epoch": 0.18032958227362994, + "grad_norm": 0.6991176945437412, + "learning_rate": 6.507850543316153e-07, + "loss": 1.5338, + "step": 2588 + }, + { + "epoch": 0.18039926140124726, + "grad_norm": 0.6942418119782533, + "learning_rate": 6.507480375506177e-07, + "loss": 1.6089, + "step": 2589 + }, + { + "epoch": 0.18046894052886459, + "grad_norm": 0.7744690223635904, + "learning_rate": 6.507110080344287e-07, + "loss": 1.6037, + "step": 2590 + }, + { + "epoch": 0.1805386196564819, + "grad_norm": 0.748844004383975, + "learning_rate": 6.506739657848227e-07, + "loss": 1.5866, + "step": 2591 + }, + { + "epoch": 0.18060829878409923, + "grad_norm": 0.7649898548673842, + "learning_rate": 6.506369108035749e-07, + "loss": 1.5733, + "step": 2592 + }, + { + "epoch": 0.18067797791171655, + "grad_norm": 0.7134369595094384, + "learning_rate": 6.505998430924608e-07, + "loss": 1.515, + "step": 2593 + }, + { + "epoch": 0.18074765703933388, + "grad_norm": 0.678241174097191, + "learning_rate": 6.505627626532572e-07, + "loss": 1.5061, + "step": 2594 + }, + { + "epoch": 0.1808173361669512, + "grad_norm": 0.7642942419987621, + "learning_rate": 6.505256694877406e-07, + "loss": 1.5778, + "step": 2595 + }, + { + "epoch": 0.18088701529456852, + "grad_norm": 0.7179287288029527, + "learning_rate": 6.50488563597689e-07, + "loss": 1.6228, + "step": 2596 + }, + { + "epoch": 0.18095669442218584, + "grad_norm": 0.7435897191353903, + "learning_rate": 6.504514449848803e-07, + "loss": 1.5228, + "step": 2597 + }, + { + "epoch": 0.18102637354980317, + "grad_norm": 0.7270413279809006, + "learning_rate": 6.504143136510932e-07, + "loss": 1.5702, + "step": 2598 + }, + { + "epoch": 0.1810960526774205, + "grad_norm": 0.6872507120511585, + "learning_rate": 6.503771695981076e-07, + "loss": 1.4811, + "step": 2599 + }, + { + "epoch": 0.1811657318050378, + "grad_norm": 0.7803447095575899, + "learning_rate": 6.503400128277032e-07, + "loss": 1.4736, + "step": 2600 + }, + { + "epoch": 0.18123541093265513, + "grad_norm": 0.7177509621225837, + "learning_rate": 6.503028433416605e-07, + "loss": 1.5244, + "step": 2601 + }, + { + "epoch": 0.18130509006027246, + "grad_norm": 0.7965152976759453, + "learning_rate": 6.502656611417609e-07, + "loss": 1.4269, + "step": 2602 + }, + { + "epoch": 0.18137476918788978, + "grad_norm": 0.7101628004895346, + "learning_rate": 6.502284662297863e-07, + "loss": 1.5922, + "step": 2603 + }, + { + "epoch": 0.1814444483155071, + "grad_norm": 0.6982929845687866, + "learning_rate": 6.50191258607519e-07, + "loss": 1.5254, + "step": 2604 + }, + { + "epoch": 0.18151412744312442, + "grad_norm": 0.6950322548952821, + "learning_rate": 6.501540382767424e-07, + "loss": 1.4629, + "step": 2605 + }, + { + "epoch": 0.18158380657074175, + "grad_norm": 0.7202886551001328, + "learning_rate": 6.501168052392397e-07, + "loss": 1.552, + "step": 2606 + }, + { + "epoch": 0.18165348569835907, + "grad_norm": 0.7161505166062673, + "learning_rate": 6.500795594967954e-07, + "loss": 1.6631, + "step": 2607 + }, + { + "epoch": 0.1817231648259764, + "grad_norm": 0.6878116644197528, + "learning_rate": 6.500423010511946e-07, + "loss": 1.4918, + "step": 2608 + }, + { + "epoch": 0.1817928439535937, + "grad_norm": 0.7353866773063502, + "learning_rate": 6.500050299042224e-07, + "loss": 1.554, + "step": 2609 + }, + { + "epoch": 0.18186252308121104, + "grad_norm": 0.7164396573550172, + "learning_rate": 6.499677460576653e-07, + "loss": 1.4531, + "step": 2610 + }, + { + "epoch": 0.18193220220882836, + "grad_norm": 0.7912418932841199, + "learning_rate": 6.499304495133098e-07, + "loss": 1.5789, + "step": 2611 + }, + { + "epoch": 0.18200188133644568, + "grad_norm": 0.763240467980432, + "learning_rate": 6.498931402729434e-07, + "loss": 1.5829, + "step": 2612 + }, + { + "epoch": 0.182071560464063, + "grad_norm": 0.7439100105793851, + "learning_rate": 6.498558183383537e-07, + "loss": 1.5418, + "step": 2613 + }, + { + "epoch": 0.1821412395916803, + "grad_norm": 0.6859069824967786, + "learning_rate": 6.498184837113295e-07, + "loss": 1.6184, + "step": 2614 + }, + { + "epoch": 0.18221091871929762, + "grad_norm": 0.7132179337585111, + "learning_rate": 6.497811363936598e-07, + "loss": 1.4808, + "step": 2615 + }, + { + "epoch": 0.18228059784691494, + "grad_norm": 0.7409771501660511, + "learning_rate": 6.497437763871346e-07, + "loss": 1.4906, + "step": 2616 + }, + { + "epoch": 0.18235027697453227, + "grad_norm": 0.6944759248214315, + "learning_rate": 6.497064036935441e-07, + "loss": 1.5897, + "step": 2617 + }, + { + "epoch": 0.1824199561021496, + "grad_norm": 0.7015003873915269, + "learning_rate": 6.496690183146794e-07, + "loss": 1.509, + "step": 2618 + }, + { + "epoch": 0.1824896352297669, + "grad_norm": 0.6690613285222292, + "learning_rate": 6.496316202523318e-07, + "loss": 1.4293, + "step": 2619 + }, + { + "epoch": 0.18255931435738423, + "grad_norm": 0.749788872274255, + "learning_rate": 6.495942095082938e-07, + "loss": 1.5291, + "step": 2620 + }, + { + "epoch": 0.18262899348500156, + "grad_norm": 0.7473142725831187, + "learning_rate": 6.49556786084358e-07, + "loss": 1.6343, + "step": 2621 + }, + { + "epoch": 0.18269867261261888, + "grad_norm": 0.7249410808106185, + "learning_rate": 6.49519349982318e-07, + "loss": 1.4134, + "step": 2622 + }, + { + "epoch": 0.1827683517402362, + "grad_norm": 0.6833719220191706, + "learning_rate": 6.494819012039675e-07, + "loss": 1.5008, + "step": 2623 + }, + { + "epoch": 0.18283803086785352, + "grad_norm": 0.7862174459510464, + "learning_rate": 6.494444397511014e-07, + "loss": 1.6386, + "step": 2624 + }, + { + "epoch": 0.18290770999547085, + "grad_norm": 0.730100169454004, + "learning_rate": 6.49406965625515e-07, + "loss": 1.6254, + "step": 2625 + }, + { + "epoch": 0.18297738912308817, + "grad_norm": 0.7313275186062451, + "learning_rate": 6.493694788290038e-07, + "loss": 1.5602, + "step": 2626 + }, + { + "epoch": 0.1830470682507055, + "grad_norm": 0.8129828741881585, + "learning_rate": 6.493319793633646e-07, + "loss": 1.594, + "step": 2627 + }, + { + "epoch": 0.18311674737832281, + "grad_norm": 0.6898176815693812, + "learning_rate": 6.492944672303941e-07, + "loss": 1.634, + "step": 2628 + }, + { + "epoch": 0.18318642650594014, + "grad_norm": 0.708461442459301, + "learning_rate": 6.492569424318904e-07, + "loss": 1.5954, + "step": 2629 + }, + { + "epoch": 0.18325610563355746, + "grad_norm": 0.7568109034106955, + "learning_rate": 6.492194049696513e-07, + "loss": 1.5439, + "step": 2630 + }, + { + "epoch": 0.18332578476117478, + "grad_norm": 0.6771063848093974, + "learning_rate": 6.49181854845476e-07, + "loss": 1.5704, + "step": 2631 + }, + { + "epoch": 0.1833954638887921, + "grad_norm": 0.676580159279378, + "learning_rate": 6.491442920611637e-07, + "loss": 1.5426, + "step": 2632 + }, + { + "epoch": 0.18346514301640943, + "grad_norm": 0.6600845607953997, + "learning_rate": 6.491067166185148e-07, + "loss": 1.4584, + "step": 2633 + }, + { + "epoch": 0.18353482214402675, + "grad_norm": 0.724526501155734, + "learning_rate": 6.490691285193298e-07, + "loss": 1.4994, + "step": 2634 + }, + { + "epoch": 0.18360450127164407, + "grad_norm": 0.7652557824216829, + "learning_rate": 6.490315277654101e-07, + "loss": 1.5651, + "step": 2635 + }, + { + "epoch": 0.1836741803992614, + "grad_norm": 0.7187368483648745, + "learning_rate": 6.489939143585574e-07, + "loss": 1.5492, + "step": 2636 + }, + { + "epoch": 0.18374385952687872, + "grad_norm": 0.7394868496695424, + "learning_rate": 6.489562883005744e-07, + "loss": 1.7474, + "step": 2637 + }, + { + "epoch": 0.18381353865449604, + "grad_norm": 0.7707547565878867, + "learning_rate": 6.489186495932642e-07, + "loss": 1.6094, + "step": 2638 + }, + { + "epoch": 0.18388321778211336, + "grad_norm": 0.6817021323889921, + "learning_rate": 6.488809982384305e-07, + "loss": 1.5874, + "step": 2639 + }, + { + "epoch": 0.18395289690973068, + "grad_norm": 0.6530574602289403, + "learning_rate": 6.488433342378776e-07, + "loss": 1.3532, + "step": 2640 + }, + { + "epoch": 0.184022576037348, + "grad_norm": 0.6891115609332592, + "learning_rate": 6.488056575934104e-07, + "loss": 1.5033, + "step": 2641 + }, + { + "epoch": 0.18409225516496533, + "grad_norm": 0.7075654971214155, + "learning_rate": 6.487679683068348e-07, + "loss": 1.5582, + "step": 2642 + }, + { + "epoch": 0.18416193429258265, + "grad_norm": 0.6979922146733539, + "learning_rate": 6.487302663799565e-07, + "loss": 1.5725, + "step": 2643 + }, + { + "epoch": 0.18423161342019997, + "grad_norm": 0.7038692584176877, + "learning_rate": 6.486925518145822e-07, + "loss": 1.5976, + "step": 2644 + }, + { + "epoch": 0.1843012925478173, + "grad_norm": 0.7598408079706039, + "learning_rate": 6.486548246125197e-07, + "loss": 1.5574, + "step": 2645 + }, + { + "epoch": 0.18437097167543462, + "grad_norm": 0.6855757317942535, + "learning_rate": 6.486170847755767e-07, + "loss": 1.4375, + "step": 2646 + }, + { + "epoch": 0.18444065080305194, + "grad_norm": 0.7631668431699284, + "learning_rate": 6.485793323055616e-07, + "loss": 1.616, + "step": 2647 + }, + { + "epoch": 0.18451032993066926, + "grad_norm": 0.7368144191558978, + "learning_rate": 6.485415672042839e-07, + "loss": 1.5148, + "step": 2648 + }, + { + "epoch": 0.1845800090582866, + "grad_norm": 0.7685902842120601, + "learning_rate": 6.485037894735534e-07, + "loss": 1.7105, + "step": 2649 + }, + { + "epoch": 0.1846496881859039, + "grad_norm": 0.6976902801252264, + "learning_rate": 6.484659991151802e-07, + "loss": 1.4219, + "step": 2650 + }, + { + "epoch": 0.18471936731352123, + "grad_norm": 0.7580061537305252, + "learning_rate": 6.484281961309754e-07, + "loss": 1.5707, + "step": 2651 + }, + { + "epoch": 0.18478904644113855, + "grad_norm": 0.7500641059710291, + "learning_rate": 6.483903805227508e-07, + "loss": 1.5827, + "step": 2652 + }, + { + "epoch": 0.18485872556875588, + "grad_norm": 0.6649438106538588, + "learning_rate": 6.483525522923183e-07, + "loss": 1.4317, + "step": 2653 + }, + { + "epoch": 0.1849284046963732, + "grad_norm": 0.7351706654236744, + "learning_rate": 6.483147114414908e-07, + "loss": 1.5674, + "step": 2654 + }, + { + "epoch": 0.18499808382399052, + "grad_norm": 0.6680546391703736, + "learning_rate": 6.482768579720819e-07, + "loss": 1.5953, + "step": 2655 + }, + { + "epoch": 0.18506776295160784, + "grad_norm": 0.7247126042568591, + "learning_rate": 6.482389918859054e-07, + "loss": 1.4191, + "step": 2656 + }, + { + "epoch": 0.18513744207922517, + "grad_norm": 0.6911414934075633, + "learning_rate": 6.48201113184776e-07, + "loss": 1.438, + "step": 2657 + }, + { + "epoch": 0.1852071212068425, + "grad_norm": 0.734603021831186, + "learning_rate": 6.481632218705089e-07, + "loss": 1.5527, + "step": 2658 + }, + { + "epoch": 0.1852768003344598, + "grad_norm": 0.7504027564292364, + "learning_rate": 6.4812531794492e-07, + "loss": 1.6085, + "step": 2659 + }, + { + "epoch": 0.18534647946207713, + "grad_norm": 0.6875098502184053, + "learning_rate": 6.480874014098256e-07, + "loss": 1.4462, + "step": 2660 + }, + { + "epoch": 0.18541615858969446, + "grad_norm": 0.7557531183356848, + "learning_rate": 6.480494722670429e-07, + "loss": 1.5846, + "step": 2661 + }, + { + "epoch": 0.18548583771731178, + "grad_norm": 0.7337273902395194, + "learning_rate": 6.480115305183894e-07, + "loss": 1.4645, + "step": 2662 + }, + { + "epoch": 0.1855555168449291, + "grad_norm": 0.7159247755309575, + "learning_rate": 6.479735761656834e-07, + "loss": 1.5828, + "step": 2663 + }, + { + "epoch": 0.18562519597254643, + "grad_norm": 0.7390118659196949, + "learning_rate": 6.479356092107438e-07, + "loss": 1.62, + "step": 2664 + }, + { + "epoch": 0.18569487510016375, + "grad_norm": 0.758500507595704, + "learning_rate": 6.478976296553899e-07, + "loss": 1.5198, + "step": 2665 + }, + { + "epoch": 0.18576455422778107, + "grad_norm": 0.7302673339657927, + "learning_rate": 6.478596375014419e-07, + "loss": 1.6104, + "step": 2666 + }, + { + "epoch": 0.1858342333553984, + "grad_norm": 0.7771031453203399, + "learning_rate": 6.478216327507206e-07, + "loss": 1.6206, + "step": 2667 + }, + { + "epoch": 0.18590391248301572, + "grad_norm": 0.7260781373547577, + "learning_rate": 6.477836154050468e-07, + "loss": 1.5566, + "step": 2668 + }, + { + "epoch": 0.18597359161063304, + "grad_norm": 0.7006581746625344, + "learning_rate": 6.47745585466243e-07, + "loss": 1.3939, + "step": 2669 + }, + { + "epoch": 0.18604327073825036, + "grad_norm": 0.7801900637952093, + "learning_rate": 6.477075429361312e-07, + "loss": 1.5328, + "step": 2670 + }, + { + "epoch": 0.18611294986586768, + "grad_norm": 0.7423656847875956, + "learning_rate": 6.476694878165345e-07, + "loss": 1.6035, + "step": 2671 + }, + { + "epoch": 0.186182628993485, + "grad_norm": 0.7195463532032887, + "learning_rate": 6.476314201092768e-07, + "loss": 1.5965, + "step": 2672 + }, + { + "epoch": 0.18625230812110233, + "grad_norm": 0.7532581893821502, + "learning_rate": 6.475933398161822e-07, + "loss": 1.6735, + "step": 2673 + }, + { + "epoch": 0.18632198724871965, + "grad_norm": 0.7769211162227893, + "learning_rate": 6.475552469390758e-07, + "loss": 1.4756, + "step": 2674 + }, + { + "epoch": 0.18639166637633697, + "grad_norm": 0.7550947624847407, + "learning_rate": 6.475171414797828e-07, + "loss": 1.6108, + "step": 2675 + }, + { + "epoch": 0.1864613455039543, + "grad_norm": 0.6690296420086265, + "learning_rate": 6.474790234401296e-07, + "loss": 1.4959, + "step": 2676 + }, + { + "epoch": 0.18653102463157162, + "grad_norm": 0.6832451741796913, + "learning_rate": 6.474408928219426e-07, + "loss": 1.498, + "step": 2677 + }, + { + "epoch": 0.18660070375918894, + "grad_norm": 0.747393796002889, + "learning_rate": 6.474027496270493e-07, + "loss": 1.562, + "step": 2678 + }, + { + "epoch": 0.18667038288680626, + "grad_norm": 0.7611818415803452, + "learning_rate": 6.473645938572774e-07, + "loss": 1.6005, + "step": 2679 + }, + { + "epoch": 0.18674006201442359, + "grad_norm": 0.732431807713295, + "learning_rate": 6.473264255144558e-07, + "loss": 1.6562, + "step": 2680 + }, + { + "epoch": 0.1868097411420409, + "grad_norm": 0.8045845891090851, + "learning_rate": 6.472882446004132e-07, + "loss": 1.5858, + "step": 2681 + }, + { + "epoch": 0.18687942026965823, + "grad_norm": 0.6645879405992747, + "learning_rate": 6.472500511169794e-07, + "loss": 1.4494, + "step": 2682 + }, + { + "epoch": 0.18694909939727555, + "grad_norm": 0.7534293667028573, + "learning_rate": 6.472118450659848e-07, + "loss": 1.4723, + "step": 2683 + }, + { + "epoch": 0.18701877852489288, + "grad_norm": 0.7352604989735883, + "learning_rate": 6.471736264492604e-07, + "loss": 1.431, + "step": 2684 + }, + { + "epoch": 0.1870884576525102, + "grad_norm": 0.7933364996076561, + "learning_rate": 6.471353952686373e-07, + "loss": 1.524, + "step": 2685 + }, + { + "epoch": 0.18715813678012752, + "grad_norm": 0.7600850522766464, + "learning_rate": 6.470971515259481e-07, + "loss": 1.46, + "step": 2686 + }, + { + "epoch": 0.18722781590774484, + "grad_norm": 0.7938840091949416, + "learning_rate": 6.470588952230254e-07, + "loss": 1.5313, + "step": 2687 + }, + { + "epoch": 0.18729749503536217, + "grad_norm": 0.7195140084978836, + "learning_rate": 6.470206263617024e-07, + "loss": 1.4697, + "step": 2688 + }, + { + "epoch": 0.1873671741629795, + "grad_norm": 0.7005601442693249, + "learning_rate": 6.469823449438129e-07, + "loss": 1.5984, + "step": 2689 + }, + { + "epoch": 0.1874368532905968, + "grad_norm": 0.7700719550441382, + "learning_rate": 6.469440509711918e-07, + "loss": 1.5505, + "step": 2690 + }, + { + "epoch": 0.18750653241821413, + "grad_norm": 0.6955391533447876, + "learning_rate": 6.469057444456739e-07, + "loss": 1.5746, + "step": 2691 + }, + { + "epoch": 0.18757621154583146, + "grad_norm": 0.7533355959862432, + "learning_rate": 6.46867425369095e-07, + "loss": 1.4649, + "step": 2692 + }, + { + "epoch": 0.18764589067344878, + "grad_norm": 0.6861651730926349, + "learning_rate": 6.468290937432916e-07, + "loss": 1.556, + "step": 2693 + }, + { + "epoch": 0.1877155698010661, + "grad_norm": 0.6969686824642832, + "learning_rate": 6.467907495701004e-07, + "loss": 1.4597, + "step": 2694 + }, + { + "epoch": 0.18778524892868342, + "grad_norm": 0.7780056460087894, + "learning_rate": 6.467523928513592e-07, + "loss": 1.584, + "step": 2695 + }, + { + "epoch": 0.18785492805630075, + "grad_norm": 0.6971351165916029, + "learning_rate": 6.467140235889058e-07, + "loss": 1.3735, + "step": 2696 + }, + { + "epoch": 0.18792460718391807, + "grad_norm": 0.7640130727336821, + "learning_rate": 6.466756417845792e-07, + "loss": 1.6152, + "step": 2697 + }, + { + "epoch": 0.1879942863115354, + "grad_norm": 0.7174718505159229, + "learning_rate": 6.466372474402185e-07, + "loss": 1.5299, + "step": 2698 + }, + { + "epoch": 0.1880639654391527, + "grad_norm": 0.705437048563619, + "learning_rate": 6.465988405576638e-07, + "loss": 1.5884, + "step": 2699 + }, + { + "epoch": 0.18813364456677004, + "grad_norm": 0.8496788834128441, + "learning_rate": 6.465604211387557e-07, + "loss": 1.6326, + "step": 2700 + }, + { + "epoch": 0.18820332369438736, + "grad_norm": 0.7183308884189702, + "learning_rate": 6.465219891853353e-07, + "loss": 1.4403, + "step": 2701 + }, + { + "epoch": 0.18827300282200468, + "grad_norm": 0.7363655214283799, + "learning_rate": 6.464835446992441e-07, + "loss": 1.502, + "step": 2702 + }, + { + "epoch": 0.188342681949622, + "grad_norm": 0.7137203184429629, + "learning_rate": 6.464450876823248e-07, + "loss": 1.5203, + "step": 2703 + }, + { + "epoch": 0.18841236107723933, + "grad_norm": 0.7252300369350988, + "learning_rate": 6.464066181364201e-07, + "loss": 1.5046, + "step": 2704 + }, + { + "epoch": 0.18848204020485665, + "grad_norm": 0.7117919170248129, + "learning_rate": 6.463681360633735e-07, + "loss": 1.6556, + "step": 2705 + }, + { + "epoch": 0.18855171933247394, + "grad_norm": 0.720869879698167, + "learning_rate": 6.463296414650294e-07, + "loss": 1.5998, + "step": 2706 + }, + { + "epoch": 0.18862139846009127, + "grad_norm": 0.7449188955522009, + "learning_rate": 6.462911343432322e-07, + "loss": 1.4649, + "step": 2707 + }, + { + "epoch": 0.1886910775877086, + "grad_norm": 0.7439890979647477, + "learning_rate": 6.462526146998275e-07, + "loss": 1.6382, + "step": 2708 + }, + { + "epoch": 0.1887607567153259, + "grad_norm": 0.7126331244413916, + "learning_rate": 6.462140825366612e-07, + "loss": 1.5312, + "step": 2709 + }, + { + "epoch": 0.18883043584294323, + "grad_norm": 0.7735940649273119, + "learning_rate": 6.461755378555798e-07, + "loss": 1.5014, + "step": 2710 + }, + { + "epoch": 0.18890011497056056, + "grad_norm": 0.7440600101186636, + "learning_rate": 6.461369806584305e-07, + "loss": 1.6966, + "step": 2711 + }, + { + "epoch": 0.18896979409817788, + "grad_norm": 0.6759130137405256, + "learning_rate": 6.460984109470609e-07, + "loss": 1.5484, + "step": 2712 + }, + { + "epoch": 0.1890394732257952, + "grad_norm": 0.8264161895423117, + "learning_rate": 6.460598287233195e-07, + "loss": 1.5834, + "step": 2713 + }, + { + "epoch": 0.18910915235341252, + "grad_norm": 0.7491457334304705, + "learning_rate": 6.46021233989055e-07, + "loss": 1.5877, + "step": 2714 + }, + { + "epoch": 0.18917883148102985, + "grad_norm": 0.6864192440850518, + "learning_rate": 6.459826267461172e-07, + "loss": 1.5166, + "step": 2715 + }, + { + "epoch": 0.18924851060864717, + "grad_norm": 0.7259227539963868, + "learning_rate": 6.459440069963562e-07, + "loss": 1.5007, + "step": 2716 + }, + { + "epoch": 0.1893181897362645, + "grad_norm": 0.7092268754966882, + "learning_rate": 6.459053747416225e-07, + "loss": 1.5473, + "step": 2717 + }, + { + "epoch": 0.18938786886388181, + "grad_norm": 0.6882956952095184, + "learning_rate": 6.458667299837677e-07, + "loss": 1.5654, + "step": 2718 + }, + { + "epoch": 0.18945754799149914, + "grad_norm": 0.8355331625656581, + "learning_rate": 6.458280727246438e-07, + "loss": 1.5154, + "step": 2719 + }, + { + "epoch": 0.18952722711911646, + "grad_norm": 0.7352037316311917, + "learning_rate": 6.45789402966103e-07, + "loss": 1.6327, + "step": 2720 + }, + { + "epoch": 0.18959690624673378, + "grad_norm": 0.6941993455689607, + "learning_rate": 6.457507207099988e-07, + "loss": 1.4324, + "step": 2721 + }, + { + "epoch": 0.1896665853743511, + "grad_norm": 0.7649191465845473, + "learning_rate": 6.457120259581845e-07, + "loss": 1.6697, + "step": 2722 + }, + { + "epoch": 0.18973626450196843, + "grad_norm": 0.7466998683828353, + "learning_rate": 6.456733187125149e-07, + "loss": 1.659, + "step": 2723 + }, + { + "epoch": 0.18980594362958575, + "grad_norm": 0.7588182856390634, + "learning_rate": 6.456345989748445e-07, + "loss": 1.5306, + "step": 2724 + }, + { + "epoch": 0.18987562275720307, + "grad_norm": 0.6838571523104907, + "learning_rate": 6.455958667470293e-07, + "loss": 1.6308, + "step": 2725 + }, + { + "epoch": 0.1899453018848204, + "grad_norm": 0.7125291705245504, + "learning_rate": 6.455571220309251e-07, + "loss": 1.5757, + "step": 2726 + }, + { + "epoch": 0.19001498101243772, + "grad_norm": 0.7950404350336909, + "learning_rate": 6.455183648283886e-07, + "loss": 1.5175, + "step": 2727 + }, + { + "epoch": 0.19008466014005504, + "grad_norm": 0.6832770487165002, + "learning_rate": 6.454795951412773e-07, + "loss": 1.5996, + "step": 2728 + }, + { + "epoch": 0.19015433926767236, + "grad_norm": 0.723433406934566, + "learning_rate": 6.45440812971449e-07, + "loss": 1.5254, + "step": 2729 + }, + { + "epoch": 0.19022401839528968, + "grad_norm": 0.7054317859702796, + "learning_rate": 6.454020183207623e-07, + "loss": 1.5587, + "step": 2730 + }, + { + "epoch": 0.190293697522907, + "grad_norm": 0.7909451789471312, + "learning_rate": 6.453632111910763e-07, + "loss": 1.6194, + "step": 2731 + }, + { + "epoch": 0.19036337665052433, + "grad_norm": 0.6953562567380657, + "learning_rate": 6.453243915842507e-07, + "loss": 1.503, + "step": 2732 + }, + { + "epoch": 0.19043305577814165, + "grad_norm": 0.7637019576203848, + "learning_rate": 6.452855595021457e-07, + "loss": 1.6011, + "step": 2733 + }, + { + "epoch": 0.19050273490575897, + "grad_norm": 0.6822582597425052, + "learning_rate": 6.452467149466225e-07, + "loss": 1.3143, + "step": 2734 + }, + { + "epoch": 0.1905724140333763, + "grad_norm": 0.7178449079461839, + "learning_rate": 6.452078579195424e-07, + "loss": 1.5419, + "step": 2735 + }, + { + "epoch": 0.19064209316099362, + "grad_norm": 0.7138948499344089, + "learning_rate": 6.451689884227674e-07, + "loss": 1.4824, + "step": 2736 + }, + { + "epoch": 0.19071177228861094, + "grad_norm": 0.7322828001630005, + "learning_rate": 6.451301064581605e-07, + "loss": 1.506, + "step": 2737 + }, + { + "epoch": 0.19078145141622826, + "grad_norm": 0.811654935475173, + "learning_rate": 6.45091212027585e-07, + "loss": 1.5939, + "step": 2738 + }, + { + "epoch": 0.1908511305438456, + "grad_norm": 0.6802507343095897, + "learning_rate": 6.450523051329044e-07, + "loss": 1.5206, + "step": 2739 + }, + { + "epoch": 0.1909208096714629, + "grad_norm": 0.7181069784358606, + "learning_rate": 6.450133857759836e-07, + "loss": 1.566, + "step": 2740 + }, + { + "epoch": 0.19099048879908023, + "grad_norm": 0.7119050567279736, + "learning_rate": 6.449744539586875e-07, + "loss": 1.4723, + "step": 2741 + }, + { + "epoch": 0.19106016792669755, + "grad_norm": 0.7359612711039888, + "learning_rate": 6.449355096828818e-07, + "loss": 1.4862, + "step": 2742 + }, + { + "epoch": 0.19112984705431488, + "grad_norm": 0.7097328511153883, + "learning_rate": 6.44896552950433e-07, + "loss": 1.5173, + "step": 2743 + }, + { + "epoch": 0.1911995261819322, + "grad_norm": 0.6929533047668857, + "learning_rate": 6.448575837632079e-07, + "loss": 1.5577, + "step": 2744 + }, + { + "epoch": 0.19126920530954952, + "grad_norm": 0.7041347408340864, + "learning_rate": 6.448186021230737e-07, + "loss": 1.452, + "step": 2745 + }, + { + "epoch": 0.19133888443716685, + "grad_norm": 0.72942054200097, + "learning_rate": 6.447796080318986e-07, + "loss": 1.5466, + "step": 2746 + }, + { + "epoch": 0.19140856356478417, + "grad_norm": 0.7194334581281807, + "learning_rate": 6.447406014915515e-07, + "loss": 1.5971, + "step": 2747 + }, + { + "epoch": 0.1914782426924015, + "grad_norm": 0.746922395656889, + "learning_rate": 6.447015825039016e-07, + "loss": 1.6671, + "step": 2748 + }, + { + "epoch": 0.1915479218200188, + "grad_norm": 0.6766416878379267, + "learning_rate": 6.446625510708187e-07, + "loss": 1.5329, + "step": 2749 + }, + { + "epoch": 0.19161760094763614, + "grad_norm": 0.7661740967695121, + "learning_rate": 6.446235071941732e-07, + "loss": 1.6282, + "step": 2750 + }, + { + "epoch": 0.19168728007525346, + "grad_norm": 0.7270021181227755, + "learning_rate": 6.445844508758363e-07, + "loss": 1.4649, + "step": 2751 + }, + { + "epoch": 0.19175695920287078, + "grad_norm": 0.6904466633241558, + "learning_rate": 6.445453821176795e-07, + "loss": 1.4696, + "step": 2752 + }, + { + "epoch": 0.1918266383304881, + "grad_norm": 0.7594226179740771, + "learning_rate": 6.445063009215751e-07, + "loss": 1.5229, + "step": 2753 + }, + { + "epoch": 0.19189631745810543, + "grad_norm": 0.7340481003476917, + "learning_rate": 6.444672072893962e-07, + "loss": 1.5785, + "step": 2754 + }, + { + "epoch": 0.19196599658572275, + "grad_norm": 0.7553905406477419, + "learning_rate": 6.444281012230159e-07, + "loss": 1.5664, + "step": 2755 + }, + { + "epoch": 0.19203567571334007, + "grad_norm": 0.7126604908145795, + "learning_rate": 6.443889827243085e-07, + "loss": 1.5051, + "step": 2756 + }, + { + "epoch": 0.1921053548409574, + "grad_norm": 0.7123761884060794, + "learning_rate": 6.443498517951485e-07, + "loss": 1.4743, + "step": 2757 + }, + { + "epoch": 0.19217503396857472, + "grad_norm": 0.7877452122261493, + "learning_rate": 6.443107084374112e-07, + "loss": 1.5403, + "step": 2758 + }, + { + "epoch": 0.19224471309619204, + "grad_norm": 0.6537712332432835, + "learning_rate": 6.442715526529724e-07, + "loss": 1.3929, + "step": 2759 + }, + { + "epoch": 0.19231439222380936, + "grad_norm": 0.6982635432356034, + "learning_rate": 6.442323844437085e-07, + "loss": 1.5458, + "step": 2760 + }, + { + "epoch": 0.19238407135142668, + "grad_norm": 0.7007628799980677, + "learning_rate": 6.441932038114964e-07, + "loss": 1.4499, + "step": 2761 + }, + { + "epoch": 0.192453750479044, + "grad_norm": 0.6998289345209359, + "learning_rate": 6.441540107582141e-07, + "loss": 1.527, + "step": 2762 + }, + { + "epoch": 0.19252342960666133, + "grad_norm": 0.7224707964363176, + "learning_rate": 6.441148052857394e-07, + "loss": 1.5791, + "step": 2763 + }, + { + "epoch": 0.19259310873427865, + "grad_norm": 0.7549588551015194, + "learning_rate": 6.440755873959513e-07, + "loss": 1.5293, + "step": 2764 + }, + { + "epoch": 0.19266278786189597, + "grad_norm": 0.8271235962565269, + "learning_rate": 6.440363570907294e-07, + "loss": 1.6131, + "step": 2765 + }, + { + "epoch": 0.1927324669895133, + "grad_norm": 0.7332975981376336, + "learning_rate": 6.439971143719531e-07, + "loss": 1.5622, + "step": 2766 + }, + { + "epoch": 0.19280214611713062, + "grad_norm": 0.7431066568250002, + "learning_rate": 6.439578592415036e-07, + "loss": 1.4984, + "step": 2767 + }, + { + "epoch": 0.19287182524474794, + "grad_norm": 0.7062334001251804, + "learning_rate": 6.43918591701262e-07, + "loss": 1.5469, + "step": 2768 + }, + { + "epoch": 0.19294150437236526, + "grad_norm": 0.8259805932604753, + "learning_rate": 6.438793117531097e-07, + "loss": 1.813, + "step": 2769 + }, + { + "epoch": 0.19301118349998259, + "grad_norm": 0.7265022203713082, + "learning_rate": 6.438400193989292e-07, + "loss": 1.5044, + "step": 2770 + }, + { + "epoch": 0.1930808626275999, + "grad_norm": 0.7800710904673711, + "learning_rate": 6.438007146406037e-07, + "loss": 1.5255, + "step": 2771 + }, + { + "epoch": 0.19315054175521723, + "grad_norm": 0.6915373563261974, + "learning_rate": 6.437613974800168e-07, + "loss": 1.4791, + "step": 2772 + }, + { + "epoch": 0.19322022088283455, + "grad_norm": 0.7455382855726259, + "learning_rate": 6.437220679190524e-07, + "loss": 1.6197, + "step": 2773 + }, + { + "epoch": 0.19328990001045188, + "grad_norm": 0.741388638904269, + "learning_rate": 6.436827259595954e-07, + "loss": 1.5326, + "step": 2774 + }, + { + "epoch": 0.1933595791380692, + "grad_norm": 0.7274532124482767, + "learning_rate": 6.436433716035309e-07, + "loss": 1.6042, + "step": 2775 + }, + { + "epoch": 0.19342925826568652, + "grad_norm": 0.6613153079163728, + "learning_rate": 6.436040048527453e-07, + "loss": 1.423, + "step": 2776 + }, + { + "epoch": 0.19349893739330384, + "grad_norm": 0.7526717406315007, + "learning_rate": 6.435646257091248e-07, + "loss": 1.6477, + "step": 2777 + }, + { + "epoch": 0.19356861652092117, + "grad_norm": 0.7457469971419062, + "learning_rate": 6.435252341745566e-07, + "loss": 1.7472, + "step": 2778 + }, + { + "epoch": 0.1936382956485385, + "grad_norm": 0.7283337615791947, + "learning_rate": 6.434858302509284e-07, + "loss": 1.6787, + "step": 2779 + }, + { + "epoch": 0.1937079747761558, + "grad_norm": 0.7479235036434875, + "learning_rate": 6.434464139401287e-07, + "loss": 1.8808, + "step": 2780 + }, + { + "epoch": 0.19377765390377313, + "grad_norm": 0.6914895910578951, + "learning_rate": 6.434069852440461e-07, + "loss": 1.5673, + "step": 2781 + }, + { + "epoch": 0.19384733303139046, + "grad_norm": 0.7254723360609027, + "learning_rate": 6.433675441645703e-07, + "loss": 1.539, + "step": 2782 + }, + { + "epoch": 0.19391701215900778, + "grad_norm": 0.6833639219823902, + "learning_rate": 6.433280907035914e-07, + "loss": 1.5052, + "step": 2783 + }, + { + "epoch": 0.1939866912866251, + "grad_norm": 0.7420464590441822, + "learning_rate": 6.432886248630001e-07, + "loss": 1.5594, + "step": 2784 + }, + { + "epoch": 0.19405637041424242, + "grad_norm": 0.6652408829333618, + "learning_rate": 6.432491466446876e-07, + "loss": 1.2722, + "step": 2785 + }, + { + "epoch": 0.19412604954185975, + "grad_norm": 0.6626300147201509, + "learning_rate": 6.432096560505458e-07, + "loss": 1.398, + "step": 2786 + }, + { + "epoch": 0.19419572866947707, + "grad_norm": 0.7465600044204658, + "learning_rate": 6.431701530824671e-07, + "loss": 1.5446, + "step": 2787 + }, + { + "epoch": 0.1942654077970944, + "grad_norm": 0.7559501299835806, + "learning_rate": 6.431306377423449e-07, + "loss": 1.4106, + "step": 2788 + }, + { + "epoch": 0.19433508692471171, + "grad_norm": 0.7012209472341381, + "learning_rate": 6.430911100320723e-07, + "loss": 1.492, + "step": 2789 + }, + { + "epoch": 0.19440476605232904, + "grad_norm": 0.711451679800713, + "learning_rate": 6.430515699535441e-07, + "loss": 1.516, + "step": 2790 + }, + { + "epoch": 0.19447444517994636, + "grad_norm": 0.7340907534836132, + "learning_rate": 6.430120175086548e-07, + "loss": 1.5429, + "step": 2791 + }, + { + "epoch": 0.19454412430756368, + "grad_norm": 0.7246947540430131, + "learning_rate": 6.429724526993e-07, + "loss": 1.4757, + "step": 2792 + }, + { + "epoch": 0.194613803435181, + "grad_norm": 0.6721144443775302, + "learning_rate": 6.429328755273755e-07, + "loss": 1.4574, + "step": 2793 + }, + { + "epoch": 0.19468348256279833, + "grad_norm": 0.7162298227034134, + "learning_rate": 6.428932859947782e-07, + "loss": 1.5135, + "step": 2794 + }, + { + "epoch": 0.19475316169041565, + "grad_norm": 0.6931706305166699, + "learning_rate": 6.428536841034052e-07, + "loss": 1.6258, + "step": 2795 + }, + { + "epoch": 0.19482284081803297, + "grad_norm": 0.6696857466893444, + "learning_rate": 6.428140698551542e-07, + "loss": 1.4967, + "step": 2796 + }, + { + "epoch": 0.19489251994565027, + "grad_norm": 0.7079042834337671, + "learning_rate": 6.427744432519237e-07, + "loss": 1.4456, + "step": 2797 + }, + { + "epoch": 0.1949621990732676, + "grad_norm": 0.7021521523597029, + "learning_rate": 6.427348042956129e-07, + "loss": 1.6123, + "step": 2798 + }, + { + "epoch": 0.1950318782008849, + "grad_norm": 0.7125268105966331, + "learning_rate": 6.426951529881209e-07, + "loss": 1.4722, + "step": 2799 + }, + { + "epoch": 0.19510155732850223, + "grad_norm": 0.6802108240355292, + "learning_rate": 6.426554893313482e-07, + "loss": 1.5581, + "step": 2800 + }, + { + "epoch": 0.19517123645611956, + "grad_norm": 0.7984301225935575, + "learning_rate": 6.426158133271956e-07, + "loss": 1.7121, + "step": 2801 + }, + { + "epoch": 0.19524091558373688, + "grad_norm": 0.6879133160196618, + "learning_rate": 6.425761249775642e-07, + "loss": 1.5569, + "step": 2802 + }, + { + "epoch": 0.1953105947113542, + "grad_norm": 0.6831021054413613, + "learning_rate": 6.425364242843561e-07, + "loss": 1.6133, + "step": 2803 + }, + { + "epoch": 0.19538027383897152, + "grad_norm": 0.7194943179719568, + "learning_rate": 6.42496711249474e-07, + "loss": 1.6279, + "step": 2804 + }, + { + "epoch": 0.19544995296658885, + "grad_norm": 0.7155004699065798, + "learning_rate": 6.424569858748207e-07, + "loss": 1.5022, + "step": 2805 + }, + { + "epoch": 0.19551963209420617, + "grad_norm": 0.7372121801210776, + "learning_rate": 6.424172481623002e-07, + "loss": 1.4252, + "step": 2806 + }, + { + "epoch": 0.1955893112218235, + "grad_norm": 0.7147709147935137, + "learning_rate": 6.423774981138167e-07, + "loss": 1.6929, + "step": 2807 + }, + { + "epoch": 0.19565899034944081, + "grad_norm": 0.7087472475940669, + "learning_rate": 6.423377357312749e-07, + "loss": 1.6249, + "step": 2808 + }, + { + "epoch": 0.19572866947705814, + "grad_norm": 0.6918451314684043, + "learning_rate": 6.422979610165808e-07, + "loss": 1.5191, + "step": 2809 + }, + { + "epoch": 0.19579834860467546, + "grad_norm": 0.7084086537536417, + "learning_rate": 6.422581739716399e-07, + "loss": 1.5604, + "step": 2810 + }, + { + "epoch": 0.19586802773229278, + "grad_norm": 0.703091479351792, + "learning_rate": 6.422183745983593e-07, + "loss": 1.6192, + "step": 2811 + }, + { + "epoch": 0.1959377068599101, + "grad_norm": 0.7999940834598073, + "learning_rate": 6.421785628986463e-07, + "loss": 1.6106, + "step": 2812 + }, + { + "epoch": 0.19600738598752743, + "grad_norm": 0.7439364790535018, + "learning_rate": 6.421387388744083e-07, + "loss": 1.4405, + "step": 2813 + }, + { + "epoch": 0.19607706511514475, + "grad_norm": 0.7334064002022052, + "learning_rate": 6.420989025275542e-07, + "loss": 1.2867, + "step": 2814 + }, + { + "epoch": 0.19614674424276207, + "grad_norm": 0.746628498475421, + "learning_rate": 6.420590538599928e-07, + "loss": 1.4626, + "step": 2815 + }, + { + "epoch": 0.1962164233703794, + "grad_norm": 0.6954573088788314, + "learning_rate": 6.420191928736339e-07, + "loss": 1.5406, + "step": 2816 + }, + { + "epoch": 0.19628610249799672, + "grad_norm": 0.6996811523016951, + "learning_rate": 6.419793195703875e-07, + "loss": 1.6512, + "step": 2817 + }, + { + "epoch": 0.19635578162561404, + "grad_norm": 0.7488194125821913, + "learning_rate": 6.419394339521647e-07, + "loss": 1.5551, + "step": 2818 + }, + { + "epoch": 0.19642546075323136, + "grad_norm": 0.6858185427092187, + "learning_rate": 6.418995360208766e-07, + "loss": 1.5719, + "step": 2819 + }, + { + "epoch": 0.19649513988084868, + "grad_norm": 0.8012233152006635, + "learning_rate": 6.418596257784354e-07, + "loss": 1.5205, + "step": 2820 + }, + { + "epoch": 0.196564819008466, + "grad_norm": 0.6869950460759762, + "learning_rate": 6.418197032267536e-07, + "loss": 1.4701, + "step": 2821 + }, + { + "epoch": 0.19663449813608333, + "grad_norm": 0.7558716905890024, + "learning_rate": 6.417797683677444e-07, + "loss": 1.5402, + "step": 2822 + }, + { + "epoch": 0.19670417726370065, + "grad_norm": 0.6739914935874416, + "learning_rate": 6.417398212033214e-07, + "loss": 1.5758, + "step": 2823 + }, + { + "epoch": 0.19677385639131798, + "grad_norm": 0.7447417175089804, + "learning_rate": 6.416998617353993e-07, + "loss": 1.6096, + "step": 2824 + }, + { + "epoch": 0.1968435355189353, + "grad_norm": 0.6751254016745756, + "learning_rate": 6.416598899658926e-07, + "loss": 1.4143, + "step": 2825 + }, + { + "epoch": 0.19691321464655262, + "grad_norm": 0.7045687553583755, + "learning_rate": 6.416199058967173e-07, + "loss": 1.6254, + "step": 2826 + }, + { + "epoch": 0.19698289377416994, + "grad_norm": 0.7201577315390923, + "learning_rate": 6.41579909529789e-07, + "loss": 1.561, + "step": 2827 + }, + { + "epoch": 0.19705257290178727, + "grad_norm": 0.6772138874740977, + "learning_rate": 6.41539900867025e-07, + "loss": 1.6012, + "step": 2828 + }, + { + "epoch": 0.1971222520294046, + "grad_norm": 0.7547835224780833, + "learning_rate": 6.414998799103421e-07, + "loss": 1.6242, + "step": 2829 + }, + { + "epoch": 0.1971919311570219, + "grad_norm": 0.7656065190946523, + "learning_rate": 6.414598466616585e-07, + "loss": 1.6035, + "step": 2830 + }, + { + "epoch": 0.19726161028463923, + "grad_norm": 0.7244976383852213, + "learning_rate": 6.414198011228923e-07, + "loss": 1.6974, + "step": 2831 + }, + { + "epoch": 0.19733128941225656, + "grad_norm": 0.7259803282047949, + "learning_rate": 6.413797432959631e-07, + "loss": 1.5899, + "step": 2832 + }, + { + "epoch": 0.19740096853987388, + "grad_norm": 0.7657140131144465, + "learning_rate": 6.413396731827901e-07, + "loss": 1.5469, + "step": 2833 + }, + { + "epoch": 0.1974706476674912, + "grad_norm": 0.7295398215507435, + "learning_rate": 6.412995907852937e-07, + "loss": 1.5622, + "step": 2834 + }, + { + "epoch": 0.19754032679510852, + "grad_norm": 0.7318433019876475, + "learning_rate": 6.412594961053948e-07, + "loss": 1.5309, + "step": 2835 + }, + { + "epoch": 0.19761000592272585, + "grad_norm": 0.71192020542836, + "learning_rate": 6.412193891450147e-07, + "loss": 1.5897, + "step": 2836 + }, + { + "epoch": 0.19767968505034317, + "grad_norm": 0.7021056662939148, + "learning_rate": 6.411792699060755e-07, + "loss": 1.5287, + "step": 2837 + }, + { + "epoch": 0.1977493641779605, + "grad_norm": 0.7681891994041536, + "learning_rate": 6.411391383904998e-07, + "loss": 1.5876, + "step": 2838 + }, + { + "epoch": 0.1978190433055778, + "grad_norm": 0.7493583117634097, + "learning_rate": 6.410989946002107e-07, + "loss": 1.5165, + "step": 2839 + }, + { + "epoch": 0.19788872243319514, + "grad_norm": 0.7378067482423463, + "learning_rate": 6.41058838537132e-07, + "loss": 1.4552, + "step": 2840 + }, + { + "epoch": 0.19795840156081246, + "grad_norm": 0.7445427788569985, + "learning_rate": 6.41018670203188e-07, + "loss": 1.5561, + "step": 2841 + }, + { + "epoch": 0.19802808068842978, + "grad_norm": 0.7622629977423278, + "learning_rate": 6.409784896003039e-07, + "loss": 1.5894, + "step": 2842 + }, + { + "epoch": 0.1980977598160471, + "grad_norm": 0.7665002701824735, + "learning_rate": 6.40938296730405e-07, + "loss": 1.5411, + "step": 2843 + }, + { + "epoch": 0.19816743894366443, + "grad_norm": 0.7254647373761564, + "learning_rate": 6.408980915954175e-07, + "loss": 1.464, + "step": 2844 + }, + { + "epoch": 0.19823711807128175, + "grad_norm": 0.7271343977692023, + "learning_rate": 6.40857874197268e-07, + "loss": 1.6614, + "step": 2845 + }, + { + "epoch": 0.19830679719889907, + "grad_norm": 0.7139000862151373, + "learning_rate": 6.40817644537884e-07, + "loss": 1.5599, + "step": 2846 + }, + { + "epoch": 0.1983764763265164, + "grad_norm": 0.6681638182102779, + "learning_rate": 6.407774026191933e-07, + "loss": 1.6665, + "step": 2847 + }, + { + "epoch": 0.19844615545413372, + "grad_norm": 0.7868727777360315, + "learning_rate": 6.407371484431243e-07, + "loss": 1.5568, + "step": 2848 + }, + { + "epoch": 0.19851583458175104, + "grad_norm": 0.7553355394120074, + "learning_rate": 6.406968820116061e-07, + "loss": 1.551, + "step": 2849 + }, + { + "epoch": 0.19858551370936836, + "grad_norm": 0.745667391752033, + "learning_rate": 6.406566033265686e-07, + "loss": 1.467, + "step": 2850 + }, + { + "epoch": 0.19865519283698568, + "grad_norm": 0.7214889502095606, + "learning_rate": 6.406163123899416e-07, + "loss": 1.5626, + "step": 2851 + }, + { + "epoch": 0.198724871964603, + "grad_norm": 0.7039722794357991, + "learning_rate": 6.405760092036561e-07, + "loss": 1.4811, + "step": 2852 + }, + { + "epoch": 0.19879455109222033, + "grad_norm": 0.7028228796727578, + "learning_rate": 6.405356937696437e-07, + "loss": 1.5365, + "step": 2853 + }, + { + "epoch": 0.19886423021983765, + "grad_norm": 0.7327098572778575, + "learning_rate": 6.404953660898362e-07, + "loss": 1.6272, + "step": 2854 + }, + { + "epoch": 0.19893390934745497, + "grad_norm": 0.6640798875637417, + "learning_rate": 6.404550261661662e-07, + "loss": 1.4853, + "step": 2855 + }, + { + "epoch": 0.1990035884750723, + "grad_norm": 0.6993347936293075, + "learning_rate": 6.40414674000567e-07, + "loss": 1.565, + "step": 2856 + }, + { + "epoch": 0.19907326760268962, + "grad_norm": 0.68104277668191, + "learning_rate": 6.403743095949722e-07, + "loss": 1.4548, + "step": 2857 + }, + { + "epoch": 0.19914294673030694, + "grad_norm": 0.7617610054585428, + "learning_rate": 6.403339329513161e-07, + "loss": 1.6688, + "step": 2858 + }, + { + "epoch": 0.19921262585792426, + "grad_norm": 0.7131057683123566, + "learning_rate": 6.402935440715339e-07, + "loss": 1.6218, + "step": 2859 + }, + { + "epoch": 0.1992823049855416, + "grad_norm": 0.6968176358706246, + "learning_rate": 6.402531429575609e-07, + "loss": 1.4128, + "step": 2860 + }, + { + "epoch": 0.1993519841131589, + "grad_norm": 0.718108124628976, + "learning_rate": 6.402127296113334e-07, + "loss": 1.6234, + "step": 2861 + }, + { + "epoch": 0.19942166324077623, + "grad_norm": 0.7445450879925423, + "learning_rate": 6.401723040347878e-07, + "loss": 1.4594, + "step": 2862 + }, + { + "epoch": 0.19949134236839355, + "grad_norm": 0.7261122364370047, + "learning_rate": 6.401318662298615e-07, + "loss": 1.5517, + "step": 2863 + }, + { + "epoch": 0.19956102149601088, + "grad_norm": 0.7220852520629888, + "learning_rate": 6.400914161984925e-07, + "loss": 1.5947, + "step": 2864 + }, + { + "epoch": 0.1996307006236282, + "grad_norm": 0.6652206452519076, + "learning_rate": 6.400509539426191e-07, + "loss": 1.4174, + "step": 2865 + }, + { + "epoch": 0.19970037975124552, + "grad_norm": 0.7005410182147795, + "learning_rate": 6.400104794641804e-07, + "loss": 1.6145, + "step": 2866 + }, + { + "epoch": 0.19977005887886284, + "grad_norm": 0.7164367875663481, + "learning_rate": 6.39969992765116e-07, + "loss": 1.6245, + "step": 2867 + }, + { + "epoch": 0.19983973800648017, + "grad_norm": 0.7140554642769725, + "learning_rate": 6.399294938473661e-07, + "loss": 1.5329, + "step": 2868 + }, + { + "epoch": 0.1999094171340975, + "grad_norm": 0.7436557780926554, + "learning_rate": 6.398889827128715e-07, + "loss": 1.6935, + "step": 2869 + }, + { + "epoch": 0.1999790962617148, + "grad_norm": 0.7231366816956215, + "learning_rate": 6.398484593635735e-07, + "loss": 1.5485, + "step": 2870 + }, + { + "epoch": 0.20004877538933213, + "grad_norm": 0.66648456054266, + "learning_rate": 6.398079238014141e-07, + "loss": 1.4717, + "step": 2871 + }, + { + "epoch": 0.20011845451694946, + "grad_norm": 0.7575185945561026, + "learning_rate": 6.39767376028336e-07, + "loss": 1.6266, + "step": 2872 + }, + { + "epoch": 0.20018813364456678, + "grad_norm": 0.7215300676555968, + "learning_rate": 6.397268160462822e-07, + "loss": 1.5363, + "step": 2873 + }, + { + "epoch": 0.2002578127721841, + "grad_norm": 0.7612344619035825, + "learning_rate": 6.396862438571965e-07, + "loss": 1.5018, + "step": 2874 + }, + { + "epoch": 0.20032749189980142, + "grad_norm": 0.7581568365524771, + "learning_rate": 6.39645659463023e-07, + "loss": 1.5225, + "step": 2875 + }, + { + "epoch": 0.20039717102741875, + "grad_norm": 0.7129849329155351, + "learning_rate": 6.396050628657068e-07, + "loss": 1.5566, + "step": 2876 + }, + { + "epoch": 0.20046685015503607, + "grad_norm": 0.7277241962059409, + "learning_rate": 6.395644540671932e-07, + "loss": 1.5693, + "step": 2877 + }, + { + "epoch": 0.2005365292826534, + "grad_norm": 0.7268033481810099, + "learning_rate": 6.395238330694284e-07, + "loss": 1.7033, + "step": 2878 + }, + { + "epoch": 0.20060620841027071, + "grad_norm": 0.7578464136236305, + "learning_rate": 6.39483199874359e-07, + "loss": 1.5682, + "step": 2879 + }, + { + "epoch": 0.20067588753788804, + "grad_norm": 0.685464342009214, + "learning_rate": 6.394425544839323e-07, + "loss": 1.5303, + "step": 2880 + }, + { + "epoch": 0.20074556666550536, + "grad_norm": 0.7182341780133301, + "learning_rate": 6.394018969000958e-07, + "loss": 1.592, + "step": 2881 + }, + { + "epoch": 0.20081524579312268, + "grad_norm": 0.6970355774318064, + "learning_rate": 6.393612271247984e-07, + "loss": 1.5237, + "step": 2882 + }, + { + "epoch": 0.20088492492074, + "grad_norm": 0.692437523420572, + "learning_rate": 6.393205451599887e-07, + "loss": 1.5745, + "step": 2883 + }, + { + "epoch": 0.20095460404835733, + "grad_norm": 0.7421810031357134, + "learning_rate": 6.392798510076162e-07, + "loss": 1.5122, + "step": 2884 + }, + { + "epoch": 0.20102428317597465, + "grad_norm": 0.7247718274256905, + "learning_rate": 6.392391446696315e-07, + "loss": 1.5325, + "step": 2885 + }, + { + "epoch": 0.20109396230359197, + "grad_norm": 0.7015610226696939, + "learning_rate": 6.391984261479848e-07, + "loss": 1.4387, + "step": 2886 + }, + { + "epoch": 0.2011636414312093, + "grad_norm": 0.7136527685788008, + "learning_rate": 6.391576954446278e-07, + "loss": 1.5067, + "step": 2887 + }, + { + "epoch": 0.20123332055882662, + "grad_norm": 0.751014147824683, + "learning_rate": 6.391169525615121e-07, + "loss": 1.6104, + "step": 2888 + }, + { + "epoch": 0.2013029996864439, + "grad_norm": 0.8270177555880811, + "learning_rate": 6.390761975005905e-07, + "loss": 1.6917, + "step": 2889 + }, + { + "epoch": 0.20137267881406123, + "grad_norm": 0.7686620792228451, + "learning_rate": 6.39035430263816e-07, + "loss": 1.4402, + "step": 2890 + }, + { + "epoch": 0.20144235794167856, + "grad_norm": 0.6852734847807493, + "learning_rate": 6.38994650853142e-07, + "loss": 1.4847, + "step": 2891 + }, + { + "epoch": 0.20151203706929588, + "grad_norm": 0.6861463442143995, + "learning_rate": 6.389538592705229e-07, + "loss": 1.56, + "step": 2892 + }, + { + "epoch": 0.2015817161969132, + "grad_norm": 0.7185857414585152, + "learning_rate": 6.389130555179134e-07, + "loss": 1.4896, + "step": 2893 + }, + { + "epoch": 0.20165139532453052, + "grad_norm": 0.7038209840042945, + "learning_rate": 6.388722395972692e-07, + "loss": 1.4935, + "step": 2894 + }, + { + "epoch": 0.20172107445214785, + "grad_norm": 0.7107114427154652, + "learning_rate": 6.38831411510546e-07, + "loss": 1.6031, + "step": 2895 + }, + { + "epoch": 0.20179075357976517, + "grad_norm": 0.6991671503850788, + "learning_rate": 6.387905712597004e-07, + "loss": 1.5549, + "step": 2896 + }, + { + "epoch": 0.2018604327073825, + "grad_norm": 0.7530770913699526, + "learning_rate": 6.387497188466897e-07, + "loss": 1.4468, + "step": 2897 + }, + { + "epoch": 0.20193011183499981, + "grad_norm": 0.6670243465782024, + "learning_rate": 6.387088542734715e-07, + "loss": 1.6758, + "step": 2898 + }, + { + "epoch": 0.20199979096261714, + "grad_norm": 0.7092271431291487, + "learning_rate": 6.386679775420042e-07, + "loss": 1.4933, + "step": 2899 + }, + { + "epoch": 0.20206947009023446, + "grad_norm": 0.736826431597258, + "learning_rate": 6.386270886542466e-07, + "loss": 1.4696, + "step": 2900 + }, + { + "epoch": 0.20213914921785178, + "grad_norm": 0.6884689205263212, + "learning_rate": 6.385861876121582e-07, + "loss": 1.5714, + "step": 2901 + }, + { + "epoch": 0.2022088283454691, + "grad_norm": 0.7191280188616926, + "learning_rate": 6.385452744176991e-07, + "loss": 1.4575, + "step": 2902 + }, + { + "epoch": 0.20227850747308643, + "grad_norm": 0.7246993662960771, + "learning_rate": 6.385043490728299e-07, + "loss": 1.5546, + "step": 2903 + }, + { + "epoch": 0.20234818660070375, + "grad_norm": 0.7135160908956892, + "learning_rate": 6.384634115795119e-07, + "loss": 1.5166, + "step": 2904 + }, + { + "epoch": 0.20241786572832107, + "grad_norm": 0.7142107711582788, + "learning_rate": 6.384224619397069e-07, + "loss": 1.4995, + "step": 2905 + }, + { + "epoch": 0.2024875448559384, + "grad_norm": 0.7530747506999542, + "learning_rate": 6.383815001553772e-07, + "loss": 1.5028, + "step": 2906 + }, + { + "epoch": 0.20255722398355572, + "grad_norm": 0.6930300478756766, + "learning_rate": 6.383405262284859e-07, + "loss": 1.6004, + "step": 2907 + }, + { + "epoch": 0.20262690311117304, + "grad_norm": 0.8030841056649302, + "learning_rate": 6.382995401609964e-07, + "loss": 1.5417, + "step": 2908 + }, + { + "epoch": 0.20269658223879036, + "grad_norm": 0.7682765721959441, + "learning_rate": 6.38258541954873e-07, + "loss": 1.6121, + "step": 2909 + }, + { + "epoch": 0.20276626136640769, + "grad_norm": 0.7125907867109845, + "learning_rate": 6.382175316120802e-07, + "loss": 1.4901, + "step": 2910 + }, + { + "epoch": 0.202835940494025, + "grad_norm": 0.7415556985743339, + "learning_rate": 6.381765091345836e-07, + "loss": 1.624, + "step": 2911 + }, + { + "epoch": 0.20290561962164233, + "grad_norm": 0.7215852980031595, + "learning_rate": 6.381354745243489e-07, + "loss": 1.5852, + "step": 2912 + }, + { + "epoch": 0.20297529874925965, + "grad_norm": 0.6972105029569049, + "learning_rate": 6.380944277833425e-07, + "loss": 1.4699, + "step": 2913 + }, + { + "epoch": 0.20304497787687698, + "grad_norm": 0.7509877613104287, + "learning_rate": 6.380533689135316e-07, + "loss": 1.5867, + "step": 2914 + }, + { + "epoch": 0.2031146570044943, + "grad_norm": 0.7736493844250907, + "learning_rate": 6.380122979168837e-07, + "loss": 1.5804, + "step": 2915 + }, + { + "epoch": 0.20318433613211162, + "grad_norm": 0.689680312040028, + "learning_rate": 6.379712147953671e-07, + "loss": 1.5172, + "step": 2916 + }, + { + "epoch": 0.20325401525972894, + "grad_norm": 0.7474879041731053, + "learning_rate": 6.379301195509505e-07, + "loss": 1.4659, + "step": 2917 + }, + { + "epoch": 0.20332369438734627, + "grad_norm": 0.7394318317131775, + "learning_rate": 6.378890121856034e-07, + "loss": 1.5584, + "step": 2918 + }, + { + "epoch": 0.2033933735149636, + "grad_norm": 0.6932945340619577, + "learning_rate": 6.378478927012956e-07, + "loss": 1.5079, + "step": 2919 + }, + { + "epoch": 0.2034630526425809, + "grad_norm": 0.7306806137581724, + "learning_rate": 6.378067610999978e-07, + "loss": 1.6069, + "step": 2920 + }, + { + "epoch": 0.20353273177019823, + "grad_norm": 0.7651741778475234, + "learning_rate": 6.377656173836808e-07, + "loss": 1.4645, + "step": 2921 + }, + { + "epoch": 0.20360241089781556, + "grad_norm": 0.7220245915728065, + "learning_rate": 6.377244615543167e-07, + "loss": 1.5917, + "step": 2922 + }, + { + "epoch": 0.20367209002543288, + "grad_norm": 0.7149824274459259, + "learning_rate": 6.376832936138774e-07, + "loss": 1.6134, + "step": 2923 + }, + { + "epoch": 0.2037417691530502, + "grad_norm": 0.7180373040033633, + "learning_rate": 6.37642113564336e-07, + "loss": 1.4994, + "step": 2924 + }, + { + "epoch": 0.20381144828066752, + "grad_norm": 0.7100774209110662, + "learning_rate": 6.376009214076658e-07, + "loss": 1.4923, + "step": 2925 + }, + { + "epoch": 0.20388112740828485, + "grad_norm": 0.6672838981463166, + "learning_rate": 6.37559717145841e-07, + "loss": 1.5192, + "step": 2926 + }, + { + "epoch": 0.20395080653590217, + "grad_norm": 0.7585267274723554, + "learning_rate": 6.37518500780836e-07, + "loss": 1.556, + "step": 2927 + }, + { + "epoch": 0.2040204856635195, + "grad_norm": 0.7175807603218348, + "learning_rate": 6.374772723146259e-07, + "loss": 1.592, + "step": 2928 + }, + { + "epoch": 0.2040901647911368, + "grad_norm": 0.7037496395350076, + "learning_rate": 6.374360317491867e-07, + "loss": 1.5354, + "step": 2929 + }, + { + "epoch": 0.20415984391875414, + "grad_norm": 0.691830573824489, + "learning_rate": 6.373947790864947e-07, + "loss": 1.5469, + "step": 2930 + }, + { + "epoch": 0.20422952304637146, + "grad_norm": 0.7167616804294645, + "learning_rate": 6.373535143285267e-07, + "loss": 1.6573, + "step": 2931 + }, + { + "epoch": 0.20429920217398878, + "grad_norm": 0.6779235031573106, + "learning_rate": 6.373122374772601e-07, + "loss": 1.5974, + "step": 2932 + }, + { + "epoch": 0.2043688813016061, + "grad_norm": 0.6919311558080987, + "learning_rate": 6.372709485346731e-07, + "loss": 1.5194, + "step": 2933 + }, + { + "epoch": 0.20443856042922343, + "grad_norm": 0.6804788103429456, + "learning_rate": 6.372296475027444e-07, + "loss": 1.4618, + "step": 2934 + }, + { + "epoch": 0.20450823955684075, + "grad_norm": 0.7378684110679562, + "learning_rate": 6.371883343834532e-07, + "loss": 1.591, + "step": 2935 + }, + { + "epoch": 0.20457791868445807, + "grad_norm": 0.7033761641976377, + "learning_rate": 6.371470091787792e-07, + "loss": 1.4276, + "step": 2936 + }, + { + "epoch": 0.2046475978120754, + "grad_norm": 0.7226159322335646, + "learning_rate": 6.371056718907029e-07, + "loss": 1.4797, + "step": 2937 + }, + { + "epoch": 0.20471727693969272, + "grad_norm": 0.7733144725121075, + "learning_rate": 6.370643225212054e-07, + "loss": 1.6505, + "step": 2938 + }, + { + "epoch": 0.20478695606731004, + "grad_norm": 0.7425310543293385, + "learning_rate": 6.370229610722679e-07, + "loss": 1.5771, + "step": 2939 + }, + { + "epoch": 0.20485663519492736, + "grad_norm": 0.6828119499786712, + "learning_rate": 6.369815875458727e-07, + "loss": 1.5115, + "step": 2940 + }, + { + "epoch": 0.20492631432254468, + "grad_norm": 0.7025329013795072, + "learning_rate": 6.369402019440027e-07, + "loss": 1.5258, + "step": 2941 + }, + { + "epoch": 0.204995993450162, + "grad_norm": 0.7426863118585209, + "learning_rate": 6.368988042686408e-07, + "loss": 1.4511, + "step": 2942 + }, + { + "epoch": 0.20506567257777933, + "grad_norm": 0.6975590020208378, + "learning_rate": 6.368573945217712e-07, + "loss": 1.5915, + "step": 2943 + }, + { + "epoch": 0.20513535170539665, + "grad_norm": 0.6555378771055111, + "learning_rate": 6.368159727053781e-07, + "loss": 1.53, + "step": 2944 + }, + { + "epoch": 0.20520503083301397, + "grad_norm": 0.730334550632792, + "learning_rate": 6.367745388214467e-07, + "loss": 1.5767, + "step": 2945 + }, + { + "epoch": 0.2052747099606313, + "grad_norm": 0.6796967601212938, + "learning_rate": 6.367330928719625e-07, + "loss": 1.5181, + "step": 2946 + }, + { + "epoch": 0.20534438908824862, + "grad_norm": 0.7077642966530233, + "learning_rate": 6.366916348589116e-07, + "loss": 1.3541, + "step": 2947 + }, + { + "epoch": 0.20541406821586594, + "grad_norm": 0.739817028672939, + "learning_rate": 6.36650164784281e-07, + "loss": 1.5237, + "step": 2948 + }, + { + "epoch": 0.20548374734348326, + "grad_norm": 0.7232713294038303, + "learning_rate": 6.366086826500578e-07, + "loss": 1.642, + "step": 2949 + }, + { + "epoch": 0.2055534264711006, + "grad_norm": 0.7529785742217403, + "learning_rate": 6.3656718845823e-07, + "loss": 1.5518, + "step": 2950 + }, + { + "epoch": 0.2056231055987179, + "grad_norm": 0.7295660194953272, + "learning_rate": 6.365256822107859e-07, + "loss": 1.4517, + "step": 2951 + }, + { + "epoch": 0.20569278472633523, + "grad_norm": 0.6964765639169348, + "learning_rate": 6.36484163909715e-07, + "loss": 1.6233, + "step": 2952 + }, + { + "epoch": 0.20576246385395255, + "grad_norm": 0.710909846668233, + "learning_rate": 6.364426335570066e-07, + "loss": 1.6418, + "step": 2953 + }, + { + "epoch": 0.20583214298156988, + "grad_norm": 0.7652461920615087, + "learning_rate": 6.364010911546508e-07, + "loss": 1.5322, + "step": 2954 + }, + { + "epoch": 0.2059018221091872, + "grad_norm": 0.7045875156326461, + "learning_rate": 6.363595367046389e-07, + "loss": 1.5305, + "step": 2955 + }, + { + "epoch": 0.20597150123680452, + "grad_norm": 0.7563366210538794, + "learning_rate": 6.363179702089618e-07, + "loss": 1.5893, + "step": 2956 + }, + { + "epoch": 0.20604118036442184, + "grad_norm": 0.7511868496946549, + "learning_rate": 6.362763916696117e-07, + "loss": 1.732, + "step": 2957 + }, + { + "epoch": 0.20611085949203917, + "grad_norm": 0.6697718587186111, + "learning_rate": 6.362348010885809e-07, + "loss": 1.5072, + "step": 2958 + }, + { + "epoch": 0.2061805386196565, + "grad_norm": 0.7530888567410823, + "learning_rate": 6.361931984678628e-07, + "loss": 1.5611, + "step": 2959 + }, + { + "epoch": 0.2062502177472738, + "grad_norm": 0.7185438842375362, + "learning_rate": 6.361515838094509e-07, + "loss": 1.6223, + "step": 2960 + }, + { + "epoch": 0.20631989687489113, + "grad_norm": 0.7036427853275619, + "learning_rate": 6.361099571153395e-07, + "loss": 1.4914, + "step": 2961 + }, + { + "epoch": 0.20638957600250846, + "grad_norm": 0.7408387503191232, + "learning_rate": 6.360683183875235e-07, + "loss": 1.5372, + "step": 2962 + }, + { + "epoch": 0.20645925513012578, + "grad_norm": 0.7178825080999842, + "learning_rate": 6.360266676279981e-07, + "loss": 1.6065, + "step": 2963 + }, + { + "epoch": 0.2065289342577431, + "grad_norm": 0.7429861155708916, + "learning_rate": 6.359850048387594e-07, + "loss": 1.4552, + "step": 2964 + }, + { + "epoch": 0.20659861338536042, + "grad_norm": 0.7202664843056981, + "learning_rate": 6.359433300218042e-07, + "loss": 1.594, + "step": 2965 + }, + { + "epoch": 0.20666829251297775, + "grad_norm": 0.7222321118467498, + "learning_rate": 6.359016431791294e-07, + "loss": 1.5255, + "step": 2966 + }, + { + "epoch": 0.20673797164059507, + "grad_norm": 0.734344864117076, + "learning_rate": 6.358599443127328e-07, + "loss": 1.5508, + "step": 2967 + }, + { + "epoch": 0.2068076507682124, + "grad_norm": 0.7206494532948653, + "learning_rate": 6.358182334246125e-07, + "loss": 1.4947, + "step": 2968 + }, + { + "epoch": 0.20687732989582971, + "grad_norm": 0.7140087905352821, + "learning_rate": 6.357765105167676e-07, + "loss": 1.5102, + "step": 2969 + }, + { + "epoch": 0.20694700902344704, + "grad_norm": 0.7852221694206425, + "learning_rate": 6.357347755911976e-07, + "loss": 1.6091, + "step": 2970 + }, + { + "epoch": 0.20701668815106436, + "grad_norm": 0.7048526642376749, + "learning_rate": 6.356930286499024e-07, + "loss": 1.5422, + "step": 2971 + }, + { + "epoch": 0.20708636727868168, + "grad_norm": 0.8007350555344261, + "learning_rate": 6.356512696948825e-07, + "loss": 1.5874, + "step": 2972 + }, + { + "epoch": 0.207156046406299, + "grad_norm": 0.7615620997332188, + "learning_rate": 6.356094987281391e-07, + "loss": 1.4809, + "step": 2973 + }, + { + "epoch": 0.20722572553391633, + "grad_norm": 0.7434804810145779, + "learning_rate": 6.355677157516743e-07, + "loss": 1.5758, + "step": 2974 + }, + { + "epoch": 0.20729540466153365, + "grad_norm": 0.7457304333301299, + "learning_rate": 6.355259207674899e-07, + "loss": 1.4783, + "step": 2975 + }, + { + "epoch": 0.20736508378915097, + "grad_norm": 0.706990856124059, + "learning_rate": 6.354841137775891e-07, + "loss": 1.4879, + "step": 2976 + }, + { + "epoch": 0.2074347629167683, + "grad_norm": 0.6955401431087637, + "learning_rate": 6.354422947839755e-07, + "loss": 1.5454, + "step": 2977 + }, + { + "epoch": 0.20750444204438562, + "grad_norm": 0.7384175680602987, + "learning_rate": 6.354004637886528e-07, + "loss": 1.5092, + "step": 2978 + }, + { + "epoch": 0.20757412117200294, + "grad_norm": 0.685234971809416, + "learning_rate": 6.35358620793626e-07, + "loss": 1.4753, + "step": 2979 + }, + { + "epoch": 0.20764380029962026, + "grad_norm": 0.70711394306406, + "learning_rate": 6.353167658009e-07, + "loss": 1.4555, + "step": 2980 + }, + { + "epoch": 0.20771347942723756, + "grad_norm": 0.7962597663635689, + "learning_rate": 6.352748988124805e-07, + "loss": 1.4569, + "step": 2981 + }, + { + "epoch": 0.20778315855485488, + "grad_norm": 0.6868913171560058, + "learning_rate": 6.352330198303742e-07, + "loss": 1.4446, + "step": 2982 + }, + { + "epoch": 0.2078528376824722, + "grad_norm": 0.7735566971642617, + "learning_rate": 6.35191128856588e-07, + "loss": 1.4883, + "step": 2983 + }, + { + "epoch": 0.20792251681008952, + "grad_norm": 0.7079499524333197, + "learning_rate": 6.351492258931292e-07, + "loss": 1.5655, + "step": 2984 + }, + { + "epoch": 0.20799219593770685, + "grad_norm": 0.7512965571925899, + "learning_rate": 6.351073109420059e-07, + "loss": 1.4857, + "step": 2985 + }, + { + "epoch": 0.20806187506532417, + "grad_norm": 0.7695609293964266, + "learning_rate": 6.350653840052269e-07, + "loss": 1.5553, + "step": 2986 + }, + { + "epoch": 0.2081315541929415, + "grad_norm": 0.7377077462933005, + "learning_rate": 6.350234450848013e-07, + "loss": 1.5265, + "step": 2987 + }, + { + "epoch": 0.20820123332055882, + "grad_norm": 0.7476217154283284, + "learning_rate": 6.349814941827387e-07, + "loss": 1.6797, + "step": 2988 + }, + { + "epoch": 0.20827091244817614, + "grad_norm": 0.7456407850803368, + "learning_rate": 6.3493953130105e-07, + "loss": 1.4848, + "step": 2989 + }, + { + "epoch": 0.20834059157579346, + "grad_norm": 0.7976077275272218, + "learning_rate": 6.348975564417456e-07, + "loss": 1.583, + "step": 2990 + }, + { + "epoch": 0.20841027070341078, + "grad_norm": 0.6623779094432293, + "learning_rate": 6.348555696068374e-07, + "loss": 1.4389, + "step": 2991 + }, + { + "epoch": 0.2084799498310281, + "grad_norm": 0.7602196162699404, + "learning_rate": 6.348135707983374e-07, + "loss": 1.6182, + "step": 2992 + }, + { + "epoch": 0.20854962895864543, + "grad_norm": 0.7172420272508857, + "learning_rate": 6.347715600182582e-07, + "loss": 1.5597, + "step": 2993 + }, + { + "epoch": 0.20861930808626275, + "grad_norm": 0.7716121956224239, + "learning_rate": 6.347295372686129e-07, + "loss": 1.574, + "step": 2994 + }, + { + "epoch": 0.20868898721388007, + "grad_norm": 0.6863157372578493, + "learning_rate": 6.346875025514157e-07, + "loss": 1.5149, + "step": 2995 + }, + { + "epoch": 0.2087586663414974, + "grad_norm": 0.7010956595448394, + "learning_rate": 6.346454558686807e-07, + "loss": 1.491, + "step": 2996 + }, + { + "epoch": 0.20882834546911472, + "grad_norm": 0.7242536207410032, + "learning_rate": 6.34603397222423e-07, + "loss": 1.4427, + "step": 2997 + }, + { + "epoch": 0.20889802459673204, + "grad_norm": 0.7040783271824286, + "learning_rate": 6.345613266146581e-07, + "loss": 1.4716, + "step": 2998 + }, + { + "epoch": 0.20896770372434936, + "grad_norm": 0.7624020600820641, + "learning_rate": 6.34519244047402e-07, + "loss": 1.7078, + "step": 2999 + }, + { + "epoch": 0.20903738285196669, + "grad_norm": 0.7175278942824543, + "learning_rate": 6.344771495226715e-07, + "loss": 1.5841, + "step": 3000 + }, + { + "epoch": 0.209107061979584, + "grad_norm": 0.7130038274068573, + "learning_rate": 6.344350430424838e-07, + "loss": 1.5632, + "step": 3001 + }, + { + "epoch": 0.20917674110720133, + "grad_norm": 0.7453698321614705, + "learning_rate": 6.343929246088567e-07, + "loss": 1.6019, + "step": 3002 + }, + { + "epoch": 0.20924642023481865, + "grad_norm": 0.7151850979286338, + "learning_rate": 6.343507942238088e-07, + "loss": 1.6559, + "step": 3003 + }, + { + "epoch": 0.20931609936243598, + "grad_norm": 0.6936345747851341, + "learning_rate": 6.343086518893588e-07, + "loss": 1.4821, + "step": 3004 + }, + { + "epoch": 0.2093857784900533, + "grad_norm": 0.7956254510612235, + "learning_rate": 6.342664976075265e-07, + "loss": 1.5881, + "step": 3005 + }, + { + "epoch": 0.20945545761767062, + "grad_norm": 0.7429121066816823, + "learning_rate": 6.342243313803317e-07, + "loss": 1.5633, + "step": 3006 + }, + { + "epoch": 0.20952513674528794, + "grad_norm": 0.8037190129898167, + "learning_rate": 6.341821532097956e-07, + "loss": 1.5909, + "step": 3007 + }, + { + "epoch": 0.20959481587290527, + "grad_norm": 0.6477652783679634, + "learning_rate": 6.34139963097939e-07, + "loss": 1.488, + "step": 3008 + }, + { + "epoch": 0.2096644950005226, + "grad_norm": 0.7000979494433776, + "learning_rate": 6.340977610467839e-07, + "loss": 1.4087, + "step": 3009 + }, + { + "epoch": 0.2097341741281399, + "grad_norm": 0.7648185849056666, + "learning_rate": 6.340555470583526e-07, + "loss": 1.5636, + "step": 3010 + }, + { + "epoch": 0.20980385325575723, + "grad_norm": 0.6881245272099351, + "learning_rate": 6.340133211346684e-07, + "loss": 1.4254, + "step": 3011 + }, + { + "epoch": 0.20987353238337456, + "grad_norm": 0.7170054857436351, + "learning_rate": 6.339710832777545e-07, + "loss": 1.5242, + "step": 3012 + }, + { + "epoch": 0.20994321151099188, + "grad_norm": 0.7091403344071238, + "learning_rate": 6.339288334896352e-07, + "loss": 1.5227, + "step": 3013 + }, + { + "epoch": 0.2100128906386092, + "grad_norm": 0.7188824298263891, + "learning_rate": 6.338865717723351e-07, + "loss": 1.5106, + "step": 3014 + }, + { + "epoch": 0.21008256976622652, + "grad_norm": 0.7123102173208373, + "learning_rate": 6.338442981278796e-07, + "loss": 1.6579, + "step": 3015 + }, + { + "epoch": 0.21015224889384385, + "grad_norm": 0.7421057665043513, + "learning_rate": 6.338020125582944e-07, + "loss": 1.5459, + "step": 3016 + }, + { + "epoch": 0.21022192802146117, + "grad_norm": 0.726883902227082, + "learning_rate": 6.33759715065606e-07, + "loss": 1.493, + "step": 3017 + }, + { + "epoch": 0.2102916071490785, + "grad_norm": 0.7538909742546808, + "learning_rate": 6.337174056518413e-07, + "loss": 1.6548, + "step": 3018 + }, + { + "epoch": 0.2103612862766958, + "grad_norm": 0.6874902631466682, + "learning_rate": 6.336750843190281e-07, + "loss": 1.5101, + "step": 3019 + }, + { + "epoch": 0.21043096540431314, + "grad_norm": 0.6991743258217981, + "learning_rate": 6.336327510691941e-07, + "loss": 1.4517, + "step": 3020 + }, + { + "epoch": 0.21050064453193046, + "grad_norm": 0.7216212073939949, + "learning_rate": 6.335904059043684e-07, + "loss": 1.5649, + "step": 3021 + }, + { + "epoch": 0.21057032365954778, + "grad_norm": 0.6953455603901181, + "learning_rate": 6.335480488265801e-07, + "loss": 1.5648, + "step": 3022 + }, + { + "epoch": 0.2106400027871651, + "grad_norm": 0.7269944245240746, + "learning_rate": 6.33505679837859e-07, + "loss": 1.5551, + "step": 3023 + }, + { + "epoch": 0.21070968191478243, + "grad_norm": 0.7521226273898199, + "learning_rate": 6.334632989402356e-07, + "loss": 1.5345, + "step": 3024 + }, + { + "epoch": 0.21077936104239975, + "grad_norm": 0.7288849333564393, + "learning_rate": 6.334209061357407e-07, + "loss": 1.5542, + "step": 3025 + }, + { + "epoch": 0.21084904017001707, + "grad_norm": 0.723684896541463, + "learning_rate": 6.333785014264061e-07, + "loss": 1.4942, + "step": 3026 + }, + { + "epoch": 0.2109187192976344, + "grad_norm": 0.8354732057230474, + "learning_rate": 6.333360848142637e-07, + "loss": 1.5765, + "step": 3027 + }, + { + "epoch": 0.21098839842525172, + "grad_norm": 0.7266486063179814, + "learning_rate": 6.332936563013465e-07, + "loss": 1.5522, + "step": 3028 + }, + { + "epoch": 0.21105807755286904, + "grad_norm": 0.7620578829901353, + "learning_rate": 6.332512158896873e-07, + "loss": 1.6742, + "step": 3029 + }, + { + "epoch": 0.21112775668048636, + "grad_norm": 0.6453483203498609, + "learning_rate": 6.332087635813202e-07, + "loss": 1.4459, + "step": 3030 + }, + { + "epoch": 0.21119743580810368, + "grad_norm": 0.7671250472225403, + "learning_rate": 6.331662993782797e-07, + "loss": 1.608, + "step": 3031 + }, + { + "epoch": 0.211267114935721, + "grad_norm": 0.7297600261040783, + "learning_rate": 6.331238232826006e-07, + "loss": 1.5218, + "step": 3032 + }, + { + "epoch": 0.21133679406333833, + "grad_norm": 0.6976240359205483, + "learning_rate": 6.330813352963186e-07, + "loss": 1.5028, + "step": 3033 + }, + { + "epoch": 0.21140647319095565, + "grad_norm": 0.7366466923541604, + "learning_rate": 6.330388354214697e-07, + "loss": 1.5963, + "step": 3034 + }, + { + "epoch": 0.21147615231857297, + "grad_norm": 0.743045908625905, + "learning_rate": 6.329963236600905e-07, + "loss": 1.5822, + "step": 3035 + }, + { + "epoch": 0.2115458314461903, + "grad_norm": 0.7694559232108855, + "learning_rate": 6.329538000142183e-07, + "loss": 1.5745, + "step": 3036 + }, + { + "epoch": 0.21161551057380762, + "grad_norm": 0.7136032068936781, + "learning_rate": 6.329112644858911e-07, + "loss": 1.6823, + "step": 3037 + }, + { + "epoch": 0.21168518970142494, + "grad_norm": 0.7209064494933916, + "learning_rate": 6.328687170771472e-07, + "loss": 1.5415, + "step": 3038 + }, + { + "epoch": 0.21175486882904226, + "grad_norm": 0.7601206403956654, + "learning_rate": 6.328261577900253e-07, + "loss": 1.531, + "step": 3039 + }, + { + "epoch": 0.2118245479566596, + "grad_norm": 0.6959998472222233, + "learning_rate": 6.327835866265652e-07, + "loss": 1.4477, + "step": 3040 + }, + { + "epoch": 0.2118942270842769, + "grad_norm": 0.6953710184416528, + "learning_rate": 6.327410035888071e-07, + "loss": 1.58, + "step": 3041 + }, + { + "epoch": 0.21196390621189423, + "grad_norm": 0.6943533580699583, + "learning_rate": 6.326984086787913e-07, + "loss": 1.5223, + "step": 3042 + }, + { + "epoch": 0.21203358533951155, + "grad_norm": 0.7408013816517577, + "learning_rate": 6.326558018985593e-07, + "loss": 1.5385, + "step": 3043 + }, + { + "epoch": 0.21210326446712888, + "grad_norm": 0.7128173593393061, + "learning_rate": 6.32613183250153e-07, + "loss": 1.5008, + "step": 3044 + }, + { + "epoch": 0.2121729435947462, + "grad_norm": 0.7578937862434474, + "learning_rate": 6.325705527356144e-07, + "loss": 1.5314, + "step": 3045 + }, + { + "epoch": 0.21224262272236352, + "grad_norm": 0.7242230650402078, + "learning_rate": 6.325279103569868e-07, + "loss": 1.5105, + "step": 3046 + }, + { + "epoch": 0.21231230184998084, + "grad_norm": 0.7271388530049001, + "learning_rate": 6.324852561163136e-07, + "loss": 1.56, + "step": 3047 + }, + { + "epoch": 0.21238198097759817, + "grad_norm": 0.7251757959446895, + "learning_rate": 6.324425900156387e-07, + "loss": 1.6189, + "step": 3048 + }, + { + "epoch": 0.2124516601052155, + "grad_norm": 0.6949230910117429, + "learning_rate": 6.32399912057007e-07, + "loss": 1.4936, + "step": 3049 + }, + { + "epoch": 0.2125213392328328, + "grad_norm": 0.7083185802548545, + "learning_rate": 6.323572222424636e-07, + "loss": 1.6193, + "step": 3050 + }, + { + "epoch": 0.21259101836045013, + "grad_norm": 0.7188362586038636, + "learning_rate": 6.323145205740543e-07, + "loss": 1.5033, + "step": 3051 + }, + { + "epoch": 0.21266069748806746, + "grad_norm": 0.6807201669919434, + "learning_rate": 6.322718070538254e-07, + "loss": 1.4721, + "step": 3052 + }, + { + "epoch": 0.21273037661568478, + "grad_norm": 0.7290543207491141, + "learning_rate": 6.32229081683824e-07, + "loss": 1.464, + "step": 3053 + }, + { + "epoch": 0.2128000557433021, + "grad_norm": 0.7049188321594511, + "learning_rate": 6.321863444660972e-07, + "loss": 1.5481, + "step": 3054 + }, + { + "epoch": 0.21286973487091942, + "grad_norm": 0.7531590172268701, + "learning_rate": 6.321435954026935e-07, + "loss": 1.5502, + "step": 3055 + }, + { + "epoch": 0.21293941399853675, + "grad_norm": 0.7204791442591438, + "learning_rate": 6.321008344956612e-07, + "loss": 1.5385, + "step": 3056 + }, + { + "epoch": 0.21300909312615407, + "grad_norm": 0.766737235051107, + "learning_rate": 6.320580617470498e-07, + "loss": 1.5535, + "step": 3057 + }, + { + "epoch": 0.2130787722537714, + "grad_norm": 0.6949576898262453, + "learning_rate": 6.320152771589089e-07, + "loss": 1.5787, + "step": 3058 + }, + { + "epoch": 0.21314845138138871, + "grad_norm": 0.7854280947188981, + "learning_rate": 6.319724807332886e-07, + "loss": 1.5751, + "step": 3059 + }, + { + "epoch": 0.21321813050900604, + "grad_norm": 0.7310738000268694, + "learning_rate": 6.319296724722401e-07, + "loss": 1.4912, + "step": 3060 + }, + { + "epoch": 0.21328780963662336, + "grad_norm": 0.710144969632082, + "learning_rate": 6.318868523778147e-07, + "loss": 1.501, + "step": 3061 + }, + { + "epoch": 0.21335748876424068, + "grad_norm": 0.7624755707474923, + "learning_rate": 6.318440204520646e-07, + "loss": 1.5925, + "step": 3062 + }, + { + "epoch": 0.213427167891858, + "grad_norm": 0.6779707671850002, + "learning_rate": 6.318011766970422e-07, + "loss": 1.566, + "step": 3063 + }, + { + "epoch": 0.21349684701947533, + "grad_norm": 0.7282347315793222, + "learning_rate": 6.317583211148007e-07, + "loss": 1.637, + "step": 3064 + }, + { + "epoch": 0.21356652614709265, + "grad_norm": 0.6777857214243174, + "learning_rate": 6.31715453707394e-07, + "loss": 1.5141, + "step": 3065 + }, + { + "epoch": 0.21363620527470997, + "grad_norm": 0.7409112879146345, + "learning_rate": 6.316725744768762e-07, + "loss": 1.5923, + "step": 3066 + }, + { + "epoch": 0.2137058844023273, + "grad_norm": 0.7024204744147935, + "learning_rate": 6.316296834253022e-07, + "loss": 1.5464, + "step": 3067 + }, + { + "epoch": 0.21377556352994462, + "grad_norm": 0.6877856023309812, + "learning_rate": 6.315867805547275e-07, + "loss": 1.5618, + "step": 3068 + }, + { + "epoch": 0.21384524265756194, + "grad_norm": 0.7212486356758446, + "learning_rate": 6.31543865867208e-07, + "loss": 1.5316, + "step": 3069 + }, + { + "epoch": 0.21391492178517926, + "grad_norm": 0.776152740751751, + "learning_rate": 6.315009393648003e-07, + "loss": 1.6051, + "step": 3070 + }, + { + "epoch": 0.21398460091279659, + "grad_norm": 0.7071509959023832, + "learning_rate": 6.314580010495615e-07, + "loss": 1.5317, + "step": 3071 + }, + { + "epoch": 0.2140542800404139, + "grad_norm": 0.6884549731762428, + "learning_rate": 6.314150509235494e-07, + "loss": 1.4914, + "step": 3072 + }, + { + "epoch": 0.2141239591680312, + "grad_norm": 0.7408015065712721, + "learning_rate": 6.313720889888221e-07, + "loss": 1.5193, + "step": 3073 + }, + { + "epoch": 0.21419363829564853, + "grad_norm": 0.7518489520556682, + "learning_rate": 6.313291152474385e-07, + "loss": 1.5984, + "step": 3074 + }, + { + "epoch": 0.21426331742326585, + "grad_norm": 0.6931740419432002, + "learning_rate": 6.312861297014581e-07, + "loss": 1.5873, + "step": 3075 + }, + { + "epoch": 0.21433299655088317, + "grad_norm": 0.7460738164111779, + "learning_rate": 6.312431323529407e-07, + "loss": 1.5159, + "step": 3076 + }, + { + "epoch": 0.2144026756785005, + "grad_norm": 0.7066855472272406, + "learning_rate": 6.312001232039468e-07, + "loss": 1.4636, + "step": 3077 + }, + { + "epoch": 0.21447235480611782, + "grad_norm": 0.7111733820215189, + "learning_rate": 6.311571022565376e-07, + "loss": 1.6072, + "step": 3078 + }, + { + "epoch": 0.21454203393373514, + "grad_norm": 0.7146743692003119, + "learning_rate": 6.311140695127748e-07, + "loss": 1.5797, + "step": 3079 + }, + { + "epoch": 0.21461171306135246, + "grad_norm": 0.7428280015586619, + "learning_rate": 6.310710249747204e-07, + "loss": 1.5731, + "step": 3080 + }, + { + "epoch": 0.21468139218896978, + "grad_norm": 0.7142132227243223, + "learning_rate": 6.310279686444374e-07, + "loss": 1.5772, + "step": 3081 + }, + { + "epoch": 0.2147510713165871, + "grad_norm": 0.7367688211537811, + "learning_rate": 6.30984900523989e-07, + "loss": 1.5302, + "step": 3082 + }, + { + "epoch": 0.21482075044420443, + "grad_norm": 0.7051161284853323, + "learning_rate": 6.309418206154392e-07, + "loss": 1.3608, + "step": 3083 + }, + { + "epoch": 0.21489042957182175, + "grad_norm": 0.6561827725117855, + "learning_rate": 6.308987289208523e-07, + "loss": 1.442, + "step": 3084 + }, + { + "epoch": 0.21496010869943907, + "grad_norm": 0.7060953414362706, + "learning_rate": 6.308556254422937e-07, + "loss": 1.5811, + "step": 3085 + }, + { + "epoch": 0.2150297878270564, + "grad_norm": 0.712986152298592, + "learning_rate": 6.308125101818287e-07, + "loss": 1.4907, + "step": 3086 + }, + { + "epoch": 0.21509946695467372, + "grad_norm": 0.7064297350621125, + "learning_rate": 6.307693831415236e-07, + "loss": 1.4074, + "step": 3087 + }, + { + "epoch": 0.21516914608229104, + "grad_norm": 0.6978571997252455, + "learning_rate": 6.307262443234451e-07, + "loss": 1.5763, + "step": 3088 + }, + { + "epoch": 0.21523882520990836, + "grad_norm": 0.7204214153527864, + "learning_rate": 6.306830937296605e-07, + "loss": 1.5953, + "step": 3089 + }, + { + "epoch": 0.21530850433752569, + "grad_norm": 0.7251712825238252, + "learning_rate": 6.306399313622376e-07, + "loss": 1.4922, + "step": 3090 + }, + { + "epoch": 0.215378183465143, + "grad_norm": 0.789317477528938, + "learning_rate": 6.30596757223245e-07, + "loss": 1.5323, + "step": 3091 + }, + { + "epoch": 0.21544786259276033, + "grad_norm": 0.7018313808348219, + "learning_rate": 6.305535713147515e-07, + "loss": 1.5525, + "step": 3092 + }, + { + "epoch": 0.21551754172037765, + "grad_norm": 0.6641796971823418, + "learning_rate": 6.305103736388268e-07, + "loss": 1.4296, + "step": 3093 + }, + { + "epoch": 0.21558722084799498, + "grad_norm": 0.7333376762555791, + "learning_rate": 6.30467164197541e-07, + "loss": 1.5683, + "step": 3094 + }, + { + "epoch": 0.2156568999756123, + "grad_norm": 0.715162682097904, + "learning_rate": 6.304239429929647e-07, + "loss": 1.5447, + "step": 3095 + }, + { + "epoch": 0.21572657910322962, + "grad_norm": 0.705125764957599, + "learning_rate": 6.303807100271694e-07, + "loss": 1.5388, + "step": 3096 + }, + { + "epoch": 0.21579625823084694, + "grad_norm": 0.6932910886448272, + "learning_rate": 6.303374653022264e-07, + "loss": 1.3299, + "step": 3097 + }, + { + "epoch": 0.21586593735846427, + "grad_norm": 0.7285862258034365, + "learning_rate": 6.302942088202086e-07, + "loss": 1.5698, + "step": 3098 + }, + { + "epoch": 0.2159356164860816, + "grad_norm": 0.6525649404815276, + "learning_rate": 6.302509405831887e-07, + "loss": 1.4322, + "step": 3099 + }, + { + "epoch": 0.2160052956136989, + "grad_norm": 0.7069636227599213, + "learning_rate": 6.302076605932402e-07, + "loss": 1.5997, + "step": 3100 + }, + { + "epoch": 0.21607497474131623, + "grad_norm": 0.7333395774155352, + "learning_rate": 6.301643688524372e-07, + "loss": 1.5292, + "step": 3101 + }, + { + "epoch": 0.21614465386893356, + "grad_norm": 0.7868825512640599, + "learning_rate": 6.301210653628545e-07, + "loss": 1.5164, + "step": 3102 + }, + { + "epoch": 0.21621433299655088, + "grad_norm": 0.6477530449996382, + "learning_rate": 6.300777501265669e-07, + "loss": 1.4932, + "step": 3103 + }, + { + "epoch": 0.2162840121241682, + "grad_norm": 0.7382703043519941, + "learning_rate": 6.300344231456505e-07, + "loss": 1.5096, + "step": 3104 + }, + { + "epoch": 0.21635369125178552, + "grad_norm": 0.7197826816393428, + "learning_rate": 6.299910844221815e-07, + "loss": 1.5127, + "step": 3105 + }, + { + "epoch": 0.21642337037940285, + "grad_norm": 0.7162601287647885, + "learning_rate": 6.299477339582367e-07, + "loss": 1.5613, + "step": 3106 + }, + { + "epoch": 0.21649304950702017, + "grad_norm": 0.6993308172485544, + "learning_rate": 6.299043717558937e-07, + "loss": 1.3961, + "step": 3107 + }, + { + "epoch": 0.2165627286346375, + "grad_norm": 0.7050100563001422, + "learning_rate": 6.298609978172303e-07, + "loss": 1.522, + "step": 3108 + }, + { + "epoch": 0.2166324077622548, + "grad_norm": 0.7030621104722599, + "learning_rate": 6.298176121443252e-07, + "loss": 1.5093, + "step": 3109 + }, + { + "epoch": 0.21670208688987214, + "grad_norm": 0.6658177312948771, + "learning_rate": 6.297742147392577e-07, + "loss": 1.5186, + "step": 3110 + }, + { + "epoch": 0.21677176601748946, + "grad_norm": 0.6695545637176513, + "learning_rate": 6.297308056041072e-07, + "loss": 1.4912, + "step": 3111 + }, + { + "epoch": 0.21684144514510678, + "grad_norm": 0.6955784740818475, + "learning_rate": 6.296873847409542e-07, + "loss": 1.5496, + "step": 3112 + }, + { + "epoch": 0.2169111242727241, + "grad_norm": 0.6878524383136772, + "learning_rate": 6.296439521518792e-07, + "loss": 1.5043, + "step": 3113 + }, + { + "epoch": 0.21698080340034143, + "grad_norm": 0.7220128624196103, + "learning_rate": 6.296005078389639e-07, + "loss": 1.5356, + "step": 3114 + }, + { + "epoch": 0.21705048252795875, + "grad_norm": 0.6693205323609417, + "learning_rate": 6.2955705180429e-07, + "loss": 1.3807, + "step": 3115 + }, + { + "epoch": 0.21712016165557607, + "grad_norm": 0.68769141490313, + "learning_rate": 6.295135840499401e-07, + "loss": 1.6205, + "step": 3116 + }, + { + "epoch": 0.2171898407831934, + "grad_norm": 0.7197082361841207, + "learning_rate": 6.294701045779974e-07, + "loss": 1.4412, + "step": 3117 + }, + { + "epoch": 0.21725951991081072, + "grad_norm": 0.7310208796664356, + "learning_rate": 6.294266133905453e-07, + "loss": 1.612, + "step": 3118 + }, + { + "epoch": 0.21732919903842804, + "grad_norm": 0.7665599518177661, + "learning_rate": 6.293831104896682e-07, + "loss": 1.5117, + "step": 3119 + }, + { + "epoch": 0.21739887816604536, + "grad_norm": 0.7444580585825434, + "learning_rate": 6.293395958774507e-07, + "loss": 1.6397, + "step": 3120 + }, + { + "epoch": 0.21746855729366268, + "grad_norm": 0.7523077640557353, + "learning_rate": 6.29296069555978e-07, + "loss": 1.5666, + "step": 3121 + }, + { + "epoch": 0.21753823642128, + "grad_norm": 0.7124509029071153, + "learning_rate": 6.292525315273363e-07, + "loss": 1.3989, + "step": 3122 + }, + { + "epoch": 0.21760791554889733, + "grad_norm": 0.6772900926590302, + "learning_rate": 6.292089817936119e-07, + "loss": 1.4556, + "step": 3123 + }, + { + "epoch": 0.21767759467651465, + "grad_norm": 0.6820042166343526, + "learning_rate": 6.291654203568915e-07, + "loss": 1.4178, + "step": 3124 + }, + { + "epoch": 0.21774727380413197, + "grad_norm": 0.7656657737383707, + "learning_rate": 6.29121847219263e-07, + "loss": 1.5057, + "step": 3125 + }, + { + "epoch": 0.2178169529317493, + "grad_norm": 0.7702721728012195, + "learning_rate": 6.290782623828146e-07, + "loss": 1.7169, + "step": 3126 + }, + { + "epoch": 0.21788663205936662, + "grad_norm": 0.7139401052066671, + "learning_rate": 6.290346658496345e-07, + "loss": 1.4959, + "step": 3127 + }, + { + "epoch": 0.21795631118698394, + "grad_norm": 0.6650157900893546, + "learning_rate": 6.289910576218124e-07, + "loss": 1.5168, + "step": 3128 + }, + { + "epoch": 0.21802599031460126, + "grad_norm": 0.7229572770264344, + "learning_rate": 6.289474377014378e-07, + "loss": 1.5193, + "step": 3129 + }, + { + "epoch": 0.2180956694422186, + "grad_norm": 0.7516357460383902, + "learning_rate": 6.289038060906011e-07, + "loss": 1.6042, + "step": 3130 + }, + { + "epoch": 0.2181653485698359, + "grad_norm": 0.6990707991385572, + "learning_rate": 6.288601627913935e-07, + "loss": 1.5118, + "step": 3131 + }, + { + "epoch": 0.21823502769745323, + "grad_norm": 0.7402279645798079, + "learning_rate": 6.288165078059062e-07, + "loss": 1.5159, + "step": 3132 + }, + { + "epoch": 0.21830470682507055, + "grad_norm": 0.7071125829282526, + "learning_rate": 6.287728411362312e-07, + "loss": 1.4857, + "step": 3133 + }, + { + "epoch": 0.21837438595268788, + "grad_norm": 0.7145022706649667, + "learning_rate": 6.287291627844613e-07, + "loss": 1.4423, + "step": 3134 + }, + { + "epoch": 0.2184440650803052, + "grad_norm": 0.7153095213061711, + "learning_rate": 6.286854727526895e-07, + "loss": 1.5401, + "step": 3135 + }, + { + "epoch": 0.21851374420792252, + "grad_norm": 0.7388320724188436, + "learning_rate": 6.286417710430096e-07, + "loss": 1.5551, + "step": 3136 + }, + { + "epoch": 0.21858342333553984, + "grad_norm": 0.7351605398058645, + "learning_rate": 6.285980576575158e-07, + "loss": 1.5743, + "step": 3137 + }, + { + "epoch": 0.21865310246315717, + "grad_norm": 0.7103834284273812, + "learning_rate": 6.28554332598303e-07, + "loss": 1.62, + "step": 3138 + }, + { + "epoch": 0.2187227815907745, + "grad_norm": 0.7124917000500621, + "learning_rate": 6.285105958674667e-07, + "loss": 1.5462, + "step": 3139 + }, + { + "epoch": 0.2187924607183918, + "grad_norm": 0.6719455946928833, + "learning_rate": 6.284668474671026e-07, + "loss": 1.4454, + "step": 3140 + }, + { + "epoch": 0.21886213984600913, + "grad_norm": 0.7342723618991475, + "learning_rate": 6.284230873993073e-07, + "loss": 1.5297, + "step": 3141 + }, + { + "epoch": 0.21893181897362646, + "grad_norm": 0.7050821767435831, + "learning_rate": 6.28379315666178e-07, + "loss": 1.4949, + "step": 3142 + }, + { + "epoch": 0.21900149810124378, + "grad_norm": 0.7305929985653432, + "learning_rate": 6.283355322698121e-07, + "loss": 1.5449, + "step": 3143 + }, + { + "epoch": 0.2190711772288611, + "grad_norm": 0.7355604484531012, + "learning_rate": 6.282917372123081e-07, + "loss": 1.631, + "step": 3144 + }, + { + "epoch": 0.21914085635647843, + "grad_norm": 0.6723855105370937, + "learning_rate": 6.282479304957646e-07, + "loss": 1.521, + "step": 3145 + }, + { + "epoch": 0.21921053548409575, + "grad_norm": 0.7500049197897934, + "learning_rate": 6.282041121222808e-07, + "loss": 1.4776, + "step": 3146 + }, + { + "epoch": 0.21928021461171307, + "grad_norm": 0.705067231005872, + "learning_rate": 6.281602820939566e-07, + "loss": 1.4577, + "step": 3147 + }, + { + "epoch": 0.2193498937393304, + "grad_norm": 0.7447843195035967, + "learning_rate": 6.281164404128927e-07, + "loss": 1.464, + "step": 3148 + }, + { + "epoch": 0.21941957286694772, + "grad_norm": 0.6669666094624056, + "learning_rate": 6.280725870811896e-07, + "loss": 1.4811, + "step": 3149 + }, + { + "epoch": 0.21948925199456504, + "grad_norm": 0.7372591631824635, + "learning_rate": 6.280287221009493e-07, + "loss": 1.4874, + "step": 3150 + }, + { + "epoch": 0.21955893112218236, + "grad_norm": 0.7400195760354873, + "learning_rate": 6.279848454742736e-07, + "loss": 1.5233, + "step": 3151 + }, + { + "epoch": 0.21962861024979968, + "grad_norm": 0.7100152026287823, + "learning_rate": 6.279409572032653e-07, + "loss": 1.5391, + "step": 3152 + }, + { + "epoch": 0.219698289377417, + "grad_norm": 0.6998698777309963, + "learning_rate": 6.278970572900277e-07, + "loss": 1.5045, + "step": 3153 + }, + { + "epoch": 0.21976796850503433, + "grad_norm": 0.8130378158487237, + "learning_rate": 6.278531457366644e-07, + "loss": 1.5623, + "step": 3154 + }, + { + "epoch": 0.21983764763265165, + "grad_norm": 0.7189054680097751, + "learning_rate": 6.278092225452796e-07, + "loss": 1.4549, + "step": 3155 + }, + { + "epoch": 0.21990732676026897, + "grad_norm": 0.7060714272932562, + "learning_rate": 6.277652877179786e-07, + "loss": 1.6297, + "step": 3156 + }, + { + "epoch": 0.2199770058878863, + "grad_norm": 0.7851668596549601, + "learning_rate": 6.277213412568665e-07, + "loss": 1.5901, + "step": 3157 + }, + { + "epoch": 0.22004668501550362, + "grad_norm": 0.721129749016274, + "learning_rate": 6.276773831640495e-07, + "loss": 1.5617, + "step": 3158 + }, + { + "epoch": 0.22011636414312094, + "grad_norm": 0.7291404792430866, + "learning_rate": 6.276334134416341e-07, + "loss": 1.6845, + "step": 3159 + }, + { + "epoch": 0.22018604327073826, + "grad_norm": 0.6621504202587892, + "learning_rate": 6.275894320917273e-07, + "loss": 1.5213, + "step": 3160 + }, + { + "epoch": 0.22025572239835559, + "grad_norm": 0.7520204029846614, + "learning_rate": 6.27545439116437e-07, + "loss": 1.5872, + "step": 3161 + }, + { + "epoch": 0.2203254015259729, + "grad_norm": 0.7445862369015398, + "learning_rate": 6.275014345178713e-07, + "loss": 1.5763, + "step": 3162 + }, + { + "epoch": 0.22039508065359023, + "grad_norm": 0.7290777915425307, + "learning_rate": 6.27457418298139e-07, + "loss": 1.6284, + "step": 3163 + }, + { + "epoch": 0.22046475978120753, + "grad_norm": 0.752459227931593, + "learning_rate": 6.274133904593496e-07, + "loss": 1.6027, + "step": 3164 + }, + { + "epoch": 0.22053443890882485, + "grad_norm": 0.6618546415584925, + "learning_rate": 6.273693510036126e-07, + "loss": 1.5229, + "step": 3165 + }, + { + "epoch": 0.22060411803644217, + "grad_norm": 0.6864959676253173, + "learning_rate": 6.273252999330389e-07, + "loss": 1.6194, + "step": 3166 + }, + { + "epoch": 0.2206737971640595, + "grad_norm": 0.8145056745835206, + "learning_rate": 6.272812372497392e-07, + "loss": 1.3994, + "step": 3167 + }, + { + "epoch": 0.22074347629167682, + "grad_norm": 0.7224351871692737, + "learning_rate": 6.272371629558254e-07, + "loss": 1.6291, + "step": 3168 + }, + { + "epoch": 0.22081315541929414, + "grad_norm": 0.752897078662678, + "learning_rate": 6.271930770534093e-07, + "loss": 1.6119, + "step": 3169 + }, + { + "epoch": 0.22088283454691146, + "grad_norm": 0.70856599701562, + "learning_rate": 6.271489795446038e-07, + "loss": 1.6155, + "step": 3170 + }, + { + "epoch": 0.22095251367452878, + "grad_norm": 0.755885503203361, + "learning_rate": 6.27104870431522e-07, + "loss": 1.6431, + "step": 3171 + }, + { + "epoch": 0.2210221928021461, + "grad_norm": 0.7865730300687626, + "learning_rate": 6.27060749716278e-07, + "loss": 1.4305, + "step": 3172 + }, + { + "epoch": 0.22109187192976343, + "grad_norm": 0.7013495707737089, + "learning_rate": 6.270166174009857e-07, + "loss": 1.4782, + "step": 3173 + }, + { + "epoch": 0.22116155105738075, + "grad_norm": 0.6739764223939262, + "learning_rate": 6.269724734877604e-07, + "loss": 1.5228, + "step": 3174 + }, + { + "epoch": 0.22123123018499807, + "grad_norm": 0.7092554794529752, + "learning_rate": 6.269283179787171e-07, + "loss": 1.5998, + "step": 3175 + }, + { + "epoch": 0.2213009093126154, + "grad_norm": 0.68071132595703, + "learning_rate": 6.268841508759725e-07, + "loss": 1.5796, + "step": 3176 + }, + { + "epoch": 0.22137058844023272, + "grad_norm": 0.6705866455284171, + "learning_rate": 6.268399721816427e-07, + "loss": 1.5333, + "step": 3177 + }, + { + "epoch": 0.22144026756785004, + "grad_norm": 0.7316307681953858, + "learning_rate": 6.267957818978449e-07, + "loss": 1.5732, + "step": 3178 + }, + { + "epoch": 0.22150994669546736, + "grad_norm": 0.7036388021018205, + "learning_rate": 6.267515800266969e-07, + "loss": 1.4423, + "step": 3179 + }, + { + "epoch": 0.22157962582308469, + "grad_norm": 0.701486644574639, + "learning_rate": 6.267073665703168e-07, + "loss": 1.5843, + "step": 3180 + }, + { + "epoch": 0.221649304950702, + "grad_norm": 0.7315723398194484, + "learning_rate": 6.266631415308236e-07, + "loss": 1.5913, + "step": 3181 + }, + { + "epoch": 0.22171898407831933, + "grad_norm": 0.7610730029702969, + "learning_rate": 6.266189049103364e-07, + "loss": 1.5173, + "step": 3182 + }, + { + "epoch": 0.22178866320593665, + "grad_norm": 0.7267234976534973, + "learning_rate": 6.265746567109752e-07, + "loss": 1.5464, + "step": 3183 + }, + { + "epoch": 0.22185834233355398, + "grad_norm": 0.7581939561966579, + "learning_rate": 6.265303969348606e-07, + "loss": 1.5121, + "step": 3184 + }, + { + "epoch": 0.2219280214611713, + "grad_norm": 0.7818869908905624, + "learning_rate": 6.264861255841136e-07, + "loss": 1.6076, + "step": 3185 + }, + { + "epoch": 0.22199770058878862, + "grad_norm": 0.7334237426975201, + "learning_rate": 6.264418426608556e-07, + "loss": 1.5393, + "step": 3186 + }, + { + "epoch": 0.22206737971640594, + "grad_norm": 0.666609199162287, + "learning_rate": 6.263975481672088e-07, + "loss": 1.4628, + "step": 3187 + }, + { + "epoch": 0.22213705884402327, + "grad_norm": 0.7218983139497597, + "learning_rate": 6.26353242105296e-07, + "loss": 1.5396, + "step": 3188 + }, + { + "epoch": 0.2222067379716406, + "grad_norm": 0.7243553909759005, + "learning_rate": 6.263089244772403e-07, + "loss": 1.4566, + "step": 3189 + }, + { + "epoch": 0.2222764170992579, + "grad_norm": 0.7754149969874606, + "learning_rate": 6.262645952851656e-07, + "loss": 1.6057, + "step": 3190 + }, + { + "epoch": 0.22234609622687523, + "grad_norm": 0.7007605347212531, + "learning_rate": 6.262202545311961e-07, + "loss": 1.5546, + "step": 3191 + }, + { + "epoch": 0.22241577535449256, + "grad_norm": 0.7193548432527994, + "learning_rate": 6.261759022174569e-07, + "loss": 1.5214, + "step": 3192 + }, + { + "epoch": 0.22248545448210988, + "grad_norm": 0.7430888845944795, + "learning_rate": 6.261315383460732e-07, + "loss": 1.5679, + "step": 3193 + }, + { + "epoch": 0.2225551336097272, + "grad_norm": 0.7067278834368642, + "learning_rate": 6.260871629191712e-07, + "loss": 1.6559, + "step": 3194 + }, + { + "epoch": 0.22262481273734452, + "grad_norm": 0.679596178691687, + "learning_rate": 6.260427759388774e-07, + "loss": 1.4959, + "step": 3195 + }, + { + "epoch": 0.22269449186496185, + "grad_norm": 0.736290631526461, + "learning_rate": 6.25998377407319e-07, + "loss": 1.4611, + "step": 3196 + }, + { + "epoch": 0.22276417099257917, + "grad_norm": 0.7230193809098943, + "learning_rate": 6.259539673266236e-07, + "loss": 1.5939, + "step": 3197 + }, + { + "epoch": 0.2228338501201965, + "grad_norm": 0.7508187483513579, + "learning_rate": 6.259095456989193e-07, + "loss": 1.5279, + "step": 3198 + }, + { + "epoch": 0.22290352924781381, + "grad_norm": 0.7190325739384879, + "learning_rate": 6.258651125263351e-07, + "loss": 1.5578, + "step": 3199 + }, + { + "epoch": 0.22297320837543114, + "grad_norm": 0.6868033846137466, + "learning_rate": 6.258206678110001e-07, + "loss": 1.5303, + "step": 3200 + }, + { + "epoch": 0.22304288750304846, + "grad_norm": 0.7103286386533224, + "learning_rate": 6.257762115550445e-07, + "loss": 1.5063, + "step": 3201 + }, + { + "epoch": 0.22311256663066578, + "grad_norm": 0.7159508152038254, + "learning_rate": 6.257317437605984e-07, + "loss": 1.5417, + "step": 3202 + }, + { + "epoch": 0.2231822457582831, + "grad_norm": 0.754030296647133, + "learning_rate": 6.256872644297928e-07, + "loss": 1.593, + "step": 3203 + }, + { + "epoch": 0.22325192488590043, + "grad_norm": 0.7129339711853596, + "learning_rate": 6.256427735647596e-07, + "loss": 1.5354, + "step": 3204 + }, + { + "epoch": 0.22332160401351775, + "grad_norm": 0.7398839096789644, + "learning_rate": 6.255982711676306e-07, + "loss": 1.6543, + "step": 3205 + }, + { + "epoch": 0.22339128314113507, + "grad_norm": 0.7076361329437126, + "learning_rate": 6.255537572405385e-07, + "loss": 1.5077, + "step": 3206 + }, + { + "epoch": 0.2234609622687524, + "grad_norm": 0.7558888533563315, + "learning_rate": 6.255092317856164e-07, + "loss": 1.5365, + "step": 3207 + }, + { + "epoch": 0.22353064139636972, + "grad_norm": 0.6913289274165797, + "learning_rate": 6.254646948049982e-07, + "loss": 1.4091, + "step": 3208 + }, + { + "epoch": 0.22360032052398704, + "grad_norm": 0.6862729311578976, + "learning_rate": 6.254201463008183e-07, + "loss": 1.5289, + "step": 3209 + }, + { + "epoch": 0.22366999965160436, + "grad_norm": 0.7032564177413655, + "learning_rate": 6.253755862752113e-07, + "loss": 1.6496, + "step": 3210 + }, + { + "epoch": 0.22373967877922168, + "grad_norm": 0.672748516405407, + "learning_rate": 6.253310147303128e-07, + "loss": 1.3317, + "step": 3211 + }, + { + "epoch": 0.223809357906839, + "grad_norm": 0.6511809527835895, + "learning_rate": 6.252864316682587e-07, + "loss": 1.4124, + "step": 3212 + }, + { + "epoch": 0.22387903703445633, + "grad_norm": 0.785166870644245, + "learning_rate": 6.252418370911853e-07, + "loss": 1.4795, + "step": 3213 + }, + { + "epoch": 0.22394871616207365, + "grad_norm": 0.7156469515997128, + "learning_rate": 6.2519723100123e-07, + "loss": 1.61, + "step": 3214 + }, + { + "epoch": 0.22401839528969097, + "grad_norm": 0.7322960624451204, + "learning_rate": 6.251526134005302e-07, + "loss": 1.6137, + "step": 3215 + }, + { + "epoch": 0.2240880744173083, + "grad_norm": 0.7298373702823864, + "learning_rate": 6.251079842912242e-07, + "loss": 1.5798, + "step": 3216 + }, + { + "epoch": 0.22415775354492562, + "grad_norm": 0.7420559329181559, + "learning_rate": 6.250633436754507e-07, + "loss": 1.569, + "step": 3217 + }, + { + "epoch": 0.22422743267254294, + "grad_norm": 0.6912194583934528, + "learning_rate": 6.25018691555349e-07, + "loss": 1.5474, + "step": 3218 + }, + { + "epoch": 0.22429711180016026, + "grad_norm": 0.8192448526866579, + "learning_rate": 6.249740279330586e-07, + "loss": 1.691, + "step": 3219 + }, + { + "epoch": 0.2243667909277776, + "grad_norm": 0.712953611035292, + "learning_rate": 6.249293528107203e-07, + "loss": 1.5323, + "step": 3220 + }, + { + "epoch": 0.2244364700553949, + "grad_norm": 0.7296944378767534, + "learning_rate": 6.248846661904748e-07, + "loss": 1.5539, + "step": 3221 + }, + { + "epoch": 0.22450614918301223, + "grad_norm": 0.7445076520950352, + "learning_rate": 6.248399680744637e-07, + "loss": 1.5898, + "step": 3222 + }, + { + "epoch": 0.22457582831062955, + "grad_norm": 0.7085715501497795, + "learning_rate": 6.247952584648289e-07, + "loss": 1.6631, + "step": 3223 + }, + { + "epoch": 0.22464550743824688, + "grad_norm": 0.73313301704967, + "learning_rate": 6.247505373637131e-07, + "loss": 1.6425, + "step": 3224 + }, + { + "epoch": 0.2247151865658642, + "grad_norm": 0.7468409271178961, + "learning_rate": 6.247058047732591e-07, + "loss": 1.4843, + "step": 3225 + }, + { + "epoch": 0.22478486569348152, + "grad_norm": 0.7406310185623337, + "learning_rate": 6.24661060695611e-07, + "loss": 1.5649, + "step": 3226 + }, + { + "epoch": 0.22485454482109885, + "grad_norm": 0.7286940068337966, + "learning_rate": 6.246163051329129e-07, + "loss": 1.5882, + "step": 3227 + }, + { + "epoch": 0.22492422394871617, + "grad_norm": 0.7416579984863053, + "learning_rate": 6.245715380873094e-07, + "loss": 1.5131, + "step": 3228 + }, + { + "epoch": 0.2249939030763335, + "grad_norm": 0.678153652186507, + "learning_rate": 6.245267595609461e-07, + "loss": 1.4576, + "step": 3229 + }, + { + "epoch": 0.2250635822039508, + "grad_norm": 0.7760279305504684, + "learning_rate": 6.244819695559686e-07, + "loss": 1.4768, + "step": 3230 + }, + { + "epoch": 0.22513326133156814, + "grad_norm": 0.6735251740482139, + "learning_rate": 6.244371680745236e-07, + "loss": 1.4129, + "step": 3231 + }, + { + "epoch": 0.22520294045918546, + "grad_norm": 0.7608296780045913, + "learning_rate": 6.243923551187581e-07, + "loss": 1.5914, + "step": 3232 + }, + { + "epoch": 0.22527261958680278, + "grad_norm": 0.7262753496045481, + "learning_rate": 6.243475306908191e-07, + "loss": 1.5632, + "step": 3233 + }, + { + "epoch": 0.2253422987144201, + "grad_norm": 0.7168146417665728, + "learning_rate": 6.243026947928552e-07, + "loss": 1.5634, + "step": 3234 + }, + { + "epoch": 0.22541197784203743, + "grad_norm": 0.7666596628392616, + "learning_rate": 6.24257847427015e-07, + "loss": 1.5986, + "step": 3235 + }, + { + "epoch": 0.22548165696965475, + "grad_norm": 0.7096138543338777, + "learning_rate": 6.242129885954475e-07, + "loss": 1.5359, + "step": 3236 + }, + { + "epoch": 0.22555133609727207, + "grad_norm": 0.7387515003643389, + "learning_rate": 6.241681183003026e-07, + "loss": 1.533, + "step": 3237 + }, + { + "epoch": 0.2256210152248894, + "grad_norm": 0.6686624505246506, + "learning_rate": 6.241232365437303e-07, + "loss": 1.4926, + "step": 3238 + }, + { + "epoch": 0.22569069435250672, + "grad_norm": 0.7244864005371732, + "learning_rate": 6.240783433278816e-07, + "loss": 1.5109, + "step": 3239 + }, + { + "epoch": 0.22576037348012404, + "grad_norm": 0.7300898381688603, + "learning_rate": 6.240334386549079e-07, + "loss": 1.4947, + "step": 3240 + }, + { + "epoch": 0.22583005260774136, + "grad_norm": 0.694543083655283, + "learning_rate": 6.239885225269611e-07, + "loss": 1.5542, + "step": 3241 + }, + { + "epoch": 0.22589973173535868, + "grad_norm": 0.6938706067751353, + "learning_rate": 6.239435949461937e-07, + "loss": 1.5457, + "step": 3242 + }, + { + "epoch": 0.225969410862976, + "grad_norm": 0.8120680247426197, + "learning_rate": 6.238986559147587e-07, + "loss": 1.4822, + "step": 3243 + }, + { + "epoch": 0.22603908999059333, + "grad_norm": 1.0393662763357927, + "learning_rate": 6.238537054348097e-07, + "loss": 1.5993, + "step": 3244 + }, + { + "epoch": 0.22610876911821065, + "grad_norm": 0.6939123246891056, + "learning_rate": 6.238087435085006e-07, + "loss": 1.5299, + "step": 3245 + }, + { + "epoch": 0.22617844824582797, + "grad_norm": 0.7103542178040833, + "learning_rate": 6.237637701379864e-07, + "loss": 1.5642, + "step": 3246 + }, + { + "epoch": 0.2262481273734453, + "grad_norm": 0.6790085012080815, + "learning_rate": 6.237187853254221e-07, + "loss": 1.4859, + "step": 3247 + }, + { + "epoch": 0.22631780650106262, + "grad_norm": 0.7613369388575224, + "learning_rate": 6.236737890729635e-07, + "loss": 1.5173, + "step": 3248 + }, + { + "epoch": 0.22638748562867994, + "grad_norm": 0.7266168413309799, + "learning_rate": 6.23628781382767e-07, + "loss": 1.5771, + "step": 3249 + }, + { + "epoch": 0.22645716475629726, + "grad_norm": 0.7838417336138501, + "learning_rate": 6.235837622569894e-07, + "loss": 1.6464, + "step": 3250 + }, + { + "epoch": 0.22652684388391459, + "grad_norm": 0.6878127982873993, + "learning_rate": 6.235387316977881e-07, + "loss": 1.4028, + "step": 3251 + }, + { + "epoch": 0.2265965230115319, + "grad_norm": 0.6759827777027689, + "learning_rate": 6.23493689707321e-07, + "loss": 1.4111, + "step": 3252 + }, + { + "epoch": 0.22666620213914923, + "grad_norm": 0.7563518707403992, + "learning_rate": 6.234486362877468e-07, + "loss": 1.6306, + "step": 3253 + }, + { + "epoch": 0.22673588126676655, + "grad_norm": 0.705109375720128, + "learning_rate": 6.234035714412243e-07, + "loss": 1.4413, + "step": 3254 + }, + { + "epoch": 0.22680556039438388, + "grad_norm": 0.6972099729276918, + "learning_rate": 6.233584951699133e-07, + "loss": 1.5258, + "step": 3255 + }, + { + "epoch": 0.22687523952200117, + "grad_norm": 0.7448910365246851, + "learning_rate": 6.233134074759739e-07, + "loss": 1.5063, + "step": 3256 + }, + { + "epoch": 0.2269449186496185, + "grad_norm": 0.7259674297447937, + "learning_rate": 6.232683083615668e-07, + "loss": 1.6128, + "step": 3257 + }, + { + "epoch": 0.22701459777723582, + "grad_norm": 0.6804071352169611, + "learning_rate": 6.23223197828853e-07, + "loss": 1.4106, + "step": 3258 + }, + { + "epoch": 0.22708427690485314, + "grad_norm": 0.7116889143448107, + "learning_rate": 6.231780758799946e-07, + "loss": 1.628, + "step": 3259 + }, + { + "epoch": 0.22715395603247046, + "grad_norm": 0.7161916261000766, + "learning_rate": 6.231329425171538e-07, + "loss": 1.6124, + "step": 3260 + }, + { + "epoch": 0.22722363516008778, + "grad_norm": 0.7276125930319626, + "learning_rate": 6.230877977424936e-07, + "loss": 1.5841, + "step": 3261 + }, + { + "epoch": 0.2272933142877051, + "grad_norm": 0.7186008176701532, + "learning_rate": 6.230426415581773e-07, + "loss": 1.5515, + "step": 3262 + }, + { + "epoch": 0.22736299341532243, + "grad_norm": 0.7167681974231933, + "learning_rate": 6.229974739663689e-07, + "loss": 1.6088, + "step": 3263 + }, + { + "epoch": 0.22743267254293975, + "grad_norm": 0.706065353428564, + "learning_rate": 6.229522949692331e-07, + "loss": 1.556, + "step": 3264 + }, + { + "epoch": 0.22750235167055707, + "grad_norm": 0.7269340461885619, + "learning_rate": 6.229071045689346e-07, + "loss": 1.5334, + "step": 3265 + }, + { + "epoch": 0.2275720307981744, + "grad_norm": 0.7456034153946854, + "learning_rate": 6.228619027676394e-07, + "loss": 1.4739, + "step": 3266 + }, + { + "epoch": 0.22764170992579172, + "grad_norm": 0.7358562726000603, + "learning_rate": 6.228166895675134e-07, + "loss": 1.6216, + "step": 3267 + }, + { + "epoch": 0.22771138905340904, + "grad_norm": 0.7259410956511649, + "learning_rate": 6.227714649707234e-07, + "loss": 1.5702, + "step": 3268 + }, + { + "epoch": 0.22778106818102636, + "grad_norm": 0.6678683131356811, + "learning_rate": 6.227262289794368e-07, + "loss": 1.451, + "step": 3269 + }, + { + "epoch": 0.2278507473086437, + "grad_norm": 0.8432593479988705, + "learning_rate": 6.226809815958212e-07, + "loss": 1.749, + "step": 3270 + }, + { + "epoch": 0.227920426436261, + "grad_norm": 0.6699718251478374, + "learning_rate": 6.22635722822045e-07, + "loss": 1.3672, + "step": 3271 + }, + { + "epoch": 0.22799010556387833, + "grad_norm": 0.7134528447425973, + "learning_rate": 6.22590452660277e-07, + "loss": 1.4964, + "step": 3272 + }, + { + "epoch": 0.22805978469149565, + "grad_norm": 0.8470823062532795, + "learning_rate": 6.22545171112687e-07, + "loss": 1.7031, + "step": 3273 + }, + { + "epoch": 0.22812946381911298, + "grad_norm": 0.7057268115009272, + "learning_rate": 6.224998781814445e-07, + "loss": 1.5361, + "step": 3274 + }, + { + "epoch": 0.2281991429467303, + "grad_norm": 0.7369497880685594, + "learning_rate": 6.224545738687203e-07, + "loss": 1.5211, + "step": 3275 + }, + { + "epoch": 0.22826882207434762, + "grad_norm": 0.7145201468391852, + "learning_rate": 6.224092581766854e-07, + "loss": 1.5296, + "step": 3276 + }, + { + "epoch": 0.22833850120196494, + "grad_norm": 0.7103105050401702, + "learning_rate": 6.223639311075114e-07, + "loss": 1.4471, + "step": 3277 + }, + { + "epoch": 0.22840818032958227, + "grad_norm": 0.844762788612772, + "learning_rate": 6.223185926633709e-07, + "loss": 1.5651, + "step": 3278 + }, + { + "epoch": 0.2284778594571996, + "grad_norm": 0.7034913972328873, + "learning_rate": 6.22273242846436e-07, + "loss": 1.5359, + "step": 3279 + }, + { + "epoch": 0.2285475385848169, + "grad_norm": 0.7461126329176588, + "learning_rate": 6.2222788165888e-07, + "loss": 1.6537, + "step": 3280 + }, + { + "epoch": 0.22861721771243423, + "grad_norm": 0.7269639564103746, + "learning_rate": 6.221825091028772e-07, + "loss": 1.4432, + "step": 3281 + }, + { + "epoch": 0.22868689684005156, + "grad_norm": 0.6712173022990764, + "learning_rate": 6.221371251806014e-07, + "loss": 1.5733, + "step": 3282 + }, + { + "epoch": 0.22875657596766888, + "grad_norm": 0.7475145129410613, + "learning_rate": 6.220917298942278e-07, + "loss": 1.5138, + "step": 3283 + }, + { + "epoch": 0.2288262550952862, + "grad_norm": 0.7008954768552739, + "learning_rate": 6.220463232459318e-07, + "loss": 1.4898, + "step": 3284 + }, + { + "epoch": 0.22889593422290352, + "grad_norm": 0.7248710827182362, + "learning_rate": 6.220009052378892e-07, + "loss": 1.5659, + "step": 3285 + }, + { + "epoch": 0.22896561335052085, + "grad_norm": 0.7064860142782986, + "learning_rate": 6.21955475872277e-07, + "loss": 1.5297, + "step": 3286 + }, + { + "epoch": 0.22903529247813817, + "grad_norm": 0.7221219290864086, + "learning_rate": 6.219100351512717e-07, + "loss": 1.5716, + "step": 3287 + }, + { + "epoch": 0.2291049716057555, + "grad_norm": 0.7565337053436231, + "learning_rate": 6.218645830770511e-07, + "loss": 1.6439, + "step": 3288 + }, + { + "epoch": 0.22917465073337281, + "grad_norm": 0.7723173370820656, + "learning_rate": 6.218191196517935e-07, + "loss": 1.5419, + "step": 3289 + }, + { + "epoch": 0.22924432986099014, + "grad_norm": 0.6800261948404732, + "learning_rate": 6.217736448776775e-07, + "loss": 1.5262, + "step": 3290 + }, + { + "epoch": 0.22931400898860746, + "grad_norm": 0.6742666990820785, + "learning_rate": 6.217281587568823e-07, + "loss": 1.4581, + "step": 3291 + }, + { + "epoch": 0.22938368811622478, + "grad_norm": 0.6924075637187566, + "learning_rate": 6.216826612915877e-07, + "loss": 1.5268, + "step": 3292 + }, + { + "epoch": 0.2294533672438421, + "grad_norm": 0.6770195243966041, + "learning_rate": 6.216371524839743e-07, + "loss": 1.5681, + "step": 3293 + }, + { + "epoch": 0.22952304637145943, + "grad_norm": 0.6831251809175937, + "learning_rate": 6.215916323362225e-07, + "loss": 1.4456, + "step": 3294 + }, + { + "epoch": 0.22959272549907675, + "grad_norm": 0.6829693545829544, + "learning_rate": 6.215461008505141e-07, + "loss": 1.616, + "step": 3295 + }, + { + "epoch": 0.22966240462669407, + "grad_norm": 0.6949054818073175, + "learning_rate": 6.215005580290309e-07, + "loss": 1.5678, + "step": 3296 + }, + { + "epoch": 0.2297320837543114, + "grad_norm": 0.6812757453937095, + "learning_rate": 6.214550038739554e-07, + "loss": 1.5277, + "step": 3297 + }, + { + "epoch": 0.22980176288192872, + "grad_norm": 0.6736556299964148, + "learning_rate": 6.214094383874707e-07, + "loss": 1.5254, + "step": 3298 + }, + { + "epoch": 0.22987144200954604, + "grad_norm": 0.6692168534392988, + "learning_rate": 6.213638615717605e-07, + "loss": 1.5795, + "step": 3299 + }, + { + "epoch": 0.22994112113716336, + "grad_norm": 0.7090645204789643, + "learning_rate": 6.213182734290085e-07, + "loss": 1.5528, + "step": 3300 + }, + { + "epoch": 0.23001080026478068, + "grad_norm": 0.7001362981176051, + "learning_rate": 6.212726739613998e-07, + "loss": 1.4763, + "step": 3301 + }, + { + "epoch": 0.230080479392398, + "grad_norm": 0.7586834311900935, + "learning_rate": 6.212270631711197e-07, + "loss": 1.6276, + "step": 3302 + }, + { + "epoch": 0.23015015852001533, + "grad_norm": 0.7934441272813372, + "learning_rate": 6.211814410603536e-07, + "loss": 1.5447, + "step": 3303 + }, + { + "epoch": 0.23021983764763265, + "grad_norm": 0.7482907591214504, + "learning_rate": 6.211358076312881e-07, + "loss": 1.4875, + "step": 3304 + }, + { + "epoch": 0.23028951677524997, + "grad_norm": 0.6763665819146414, + "learning_rate": 6.210901628861098e-07, + "loss": 1.4554, + "step": 3305 + }, + { + "epoch": 0.2303591959028673, + "grad_norm": 0.7312207211708315, + "learning_rate": 6.210445068270063e-07, + "loss": 1.5047, + "step": 3306 + }, + { + "epoch": 0.23042887503048462, + "grad_norm": 0.7269062813292612, + "learning_rate": 6.209988394561652e-07, + "loss": 1.6715, + "step": 3307 + }, + { + "epoch": 0.23049855415810194, + "grad_norm": 0.7149134256932522, + "learning_rate": 6.209531607757755e-07, + "loss": 1.4913, + "step": 3308 + }, + { + "epoch": 0.23056823328571927, + "grad_norm": 0.6878175606991938, + "learning_rate": 6.209074707880259e-07, + "loss": 1.4401, + "step": 3309 + }, + { + "epoch": 0.2306379124133366, + "grad_norm": 0.7116693886252153, + "learning_rate": 6.208617694951059e-07, + "loss": 1.4861, + "step": 3310 + }, + { + "epoch": 0.2307075915409539, + "grad_norm": 0.6677721593746183, + "learning_rate": 6.208160568992057e-07, + "loss": 1.575, + "step": 3311 + }, + { + "epoch": 0.23077727066857123, + "grad_norm": 0.7350179713720592, + "learning_rate": 6.20770333002516e-07, + "loss": 1.5758, + "step": 3312 + }, + { + "epoch": 0.23084694979618856, + "grad_norm": 0.6928707351994486, + "learning_rate": 6.207245978072279e-07, + "loss": 1.4518, + "step": 3313 + }, + { + "epoch": 0.23091662892380588, + "grad_norm": 0.6738940033331204, + "learning_rate": 6.206788513155331e-07, + "loss": 1.4948, + "step": 3314 + }, + { + "epoch": 0.2309863080514232, + "grad_norm": 0.7204099383196804, + "learning_rate": 6.206330935296239e-07, + "loss": 1.5013, + "step": 3315 + }, + { + "epoch": 0.23105598717904052, + "grad_norm": 0.7320799572268409, + "learning_rate": 6.205873244516931e-07, + "loss": 1.6149, + "step": 3316 + }, + { + "epoch": 0.23112566630665785, + "grad_norm": 0.679257456943743, + "learning_rate": 6.20541544083934e-07, + "loss": 1.5815, + "step": 3317 + }, + { + "epoch": 0.23119534543427517, + "grad_norm": 0.759769092880823, + "learning_rate": 6.204957524285407e-07, + "loss": 1.659, + "step": 3318 + }, + { + "epoch": 0.2312650245618925, + "grad_norm": 0.7434624189053274, + "learning_rate": 6.204499494877074e-07, + "loss": 1.5217, + "step": 3319 + }, + { + "epoch": 0.2313347036895098, + "grad_norm": 0.7239122991163198, + "learning_rate": 6.204041352636293e-07, + "loss": 1.494, + "step": 3320 + }, + { + "epoch": 0.23140438281712714, + "grad_norm": 0.730206357832636, + "learning_rate": 6.203583097585015e-07, + "loss": 1.598, + "step": 3321 + }, + { + "epoch": 0.23147406194474446, + "grad_norm": 0.7236108764408529, + "learning_rate": 6.203124729745206e-07, + "loss": 1.5986, + "step": 3322 + }, + { + "epoch": 0.23154374107236178, + "grad_norm": 0.7670495414586959, + "learning_rate": 6.202666249138827e-07, + "loss": 1.6228, + "step": 3323 + }, + { + "epoch": 0.2316134201999791, + "grad_norm": 0.7049449713350656, + "learning_rate": 6.202207655787851e-07, + "loss": 1.4829, + "step": 3324 + }, + { + "epoch": 0.23168309932759643, + "grad_norm": 0.7305992229489915, + "learning_rate": 6.201748949714257e-07, + "loss": 1.6744, + "step": 3325 + }, + { + "epoch": 0.23175277845521375, + "grad_norm": 0.7119919576125097, + "learning_rate": 6.201290130940024e-07, + "loss": 1.4452, + "step": 3326 + }, + { + "epoch": 0.23182245758283107, + "grad_norm": 0.6822509435659513, + "learning_rate": 6.200831199487141e-07, + "loss": 1.4767, + "step": 3327 + }, + { + "epoch": 0.2318921367104484, + "grad_norm": 0.7289185593864849, + "learning_rate": 6.200372155377601e-07, + "loss": 1.5892, + "step": 3328 + }, + { + "epoch": 0.23196181583806572, + "grad_norm": 0.6963543315365277, + "learning_rate": 6.199912998633401e-07, + "loss": 1.5631, + "step": 3329 + }, + { + "epoch": 0.23203149496568304, + "grad_norm": 0.6930151776226096, + "learning_rate": 6.199453729276547e-07, + "loss": 1.6462, + "step": 3330 + }, + { + "epoch": 0.23210117409330036, + "grad_norm": 0.69238854833818, + "learning_rate": 6.198994347329047e-07, + "loss": 1.5036, + "step": 3331 + }, + { + "epoch": 0.23217085322091768, + "grad_norm": 0.7474791474026496, + "learning_rate": 6.198534852812916e-07, + "loss": 1.4649, + "step": 3332 + }, + { + "epoch": 0.232240532348535, + "grad_norm": 0.725018393418341, + "learning_rate": 6.198075245750173e-07, + "loss": 1.5417, + "step": 3333 + }, + { + "epoch": 0.23231021147615233, + "grad_norm": 0.7522669532923917, + "learning_rate": 6.197615526162843e-07, + "loss": 1.5881, + "step": 3334 + }, + { + "epoch": 0.23237989060376965, + "grad_norm": 0.7084064750535294, + "learning_rate": 6.197155694072958e-07, + "loss": 1.5404, + "step": 3335 + }, + { + "epoch": 0.23244956973138697, + "grad_norm": 0.7518904247342453, + "learning_rate": 6.196695749502553e-07, + "loss": 1.6081, + "step": 3336 + }, + { + "epoch": 0.2325192488590043, + "grad_norm": 0.7462448084328533, + "learning_rate": 6.19623569247367e-07, + "loss": 1.6945, + "step": 3337 + }, + { + "epoch": 0.23258892798662162, + "grad_norm": 0.7352790334391689, + "learning_rate": 6.195775523008357e-07, + "loss": 1.6254, + "step": 3338 + }, + { + "epoch": 0.23265860711423894, + "grad_norm": 0.692602365608469, + "learning_rate": 6.195315241128664e-07, + "loss": 1.5174, + "step": 3339 + }, + { + "epoch": 0.23272828624185626, + "grad_norm": 0.7557983065691252, + "learning_rate": 6.19485484685665e-07, + "loss": 1.7174, + "step": 3340 + }, + { + "epoch": 0.2327979653694736, + "grad_norm": 0.7820184701909594, + "learning_rate": 6.194394340214378e-07, + "loss": 1.629, + "step": 3341 + }, + { + "epoch": 0.2328676444970909, + "grad_norm": 0.741342894454099, + "learning_rate": 6.193933721223916e-07, + "loss": 1.5313, + "step": 3342 + }, + { + "epoch": 0.23293732362470823, + "grad_norm": 0.78097018285349, + "learning_rate": 6.193472989907339e-07, + "loss": 1.6066, + "step": 3343 + }, + { + "epoch": 0.23300700275232555, + "grad_norm": 0.7650208326074384, + "learning_rate": 6.193012146286725e-07, + "loss": 1.5455, + "step": 3344 + }, + { + "epoch": 0.23307668187994288, + "grad_norm": 0.8015620503451505, + "learning_rate": 6.192551190384158e-07, + "loss": 1.468, + "step": 3345 + }, + { + "epoch": 0.2331463610075602, + "grad_norm": 0.7433641051339869, + "learning_rate": 6.192090122221729e-07, + "loss": 1.5386, + "step": 3346 + }, + { + "epoch": 0.23321604013517752, + "grad_norm": 0.7174021653327852, + "learning_rate": 6.191628941821534e-07, + "loss": 1.7446, + "step": 3347 + }, + { + "epoch": 0.23328571926279482, + "grad_norm": 0.6836406889381682, + "learning_rate": 6.191167649205672e-07, + "loss": 1.5269, + "step": 3348 + }, + { + "epoch": 0.23335539839041214, + "grad_norm": 0.7099080109048698, + "learning_rate": 6.190706244396251e-07, + "loss": 1.5104, + "step": 3349 + }, + { + "epoch": 0.23342507751802946, + "grad_norm": 0.663174403385495, + "learning_rate": 6.19024472741538e-07, + "loss": 1.5063, + "step": 3350 + }, + { + "epoch": 0.23349475664564678, + "grad_norm": 0.7500897550374107, + "learning_rate": 6.189783098285178e-07, + "loss": 1.5334, + "step": 3351 + }, + { + "epoch": 0.2335644357732641, + "grad_norm": 0.7258609882745846, + "learning_rate": 6.189321357027766e-07, + "loss": 1.4554, + "step": 3352 + }, + { + "epoch": 0.23363411490088143, + "grad_norm": 0.7432031040341716, + "learning_rate": 6.188859503665272e-07, + "loss": 1.6481, + "step": 3353 + }, + { + "epoch": 0.23370379402849875, + "grad_norm": 0.7166121656267893, + "learning_rate": 6.188397538219829e-07, + "loss": 1.595, + "step": 3354 + }, + { + "epoch": 0.23377347315611607, + "grad_norm": 0.6927380416464944, + "learning_rate": 6.187935460713575e-07, + "loss": 1.4588, + "step": 3355 + }, + { + "epoch": 0.2338431522837334, + "grad_norm": 0.7540129503422559, + "learning_rate": 6.187473271168655e-07, + "loss": 1.5675, + "step": 3356 + }, + { + "epoch": 0.23391283141135072, + "grad_norm": 0.7308119077873947, + "learning_rate": 6.187010969607217e-07, + "loss": 1.6251, + "step": 3357 + }, + { + "epoch": 0.23398251053896804, + "grad_norm": 0.6693944602398552, + "learning_rate": 6.186548556051415e-07, + "loss": 1.5107, + "step": 3358 + }, + { + "epoch": 0.23405218966658536, + "grad_norm": 0.7086992977341247, + "learning_rate": 6.18608603052341e-07, + "loss": 1.5787, + "step": 3359 + }, + { + "epoch": 0.2341218687942027, + "grad_norm": 0.6669225432306505, + "learning_rate": 6.185623393045367e-07, + "loss": 1.5375, + "step": 3360 + }, + { + "epoch": 0.23419154792182, + "grad_norm": 0.6538619423769466, + "learning_rate": 6.185160643639454e-07, + "loss": 1.4486, + "step": 3361 + }, + { + "epoch": 0.23426122704943733, + "grad_norm": 0.7317000886747869, + "learning_rate": 6.184697782327851e-07, + "loss": 1.5035, + "step": 3362 + }, + { + "epoch": 0.23433090617705465, + "grad_norm": 0.680179715005664, + "learning_rate": 6.184234809132737e-07, + "loss": 1.4639, + "step": 3363 + }, + { + "epoch": 0.23440058530467198, + "grad_norm": 0.8368622763484619, + "learning_rate": 6.183771724076298e-07, + "loss": 1.6463, + "step": 3364 + }, + { + "epoch": 0.2344702644322893, + "grad_norm": 0.8092067224444279, + "learning_rate": 6.183308527180727e-07, + "loss": 1.6825, + "step": 3365 + }, + { + "epoch": 0.23453994355990662, + "grad_norm": 0.727191601732615, + "learning_rate": 6.182845218468222e-07, + "loss": 1.5721, + "step": 3366 + }, + { + "epoch": 0.23460962268752394, + "grad_norm": 0.7190081206311841, + "learning_rate": 6.182381797960983e-07, + "loss": 1.4984, + "step": 3367 + }, + { + "epoch": 0.23467930181514127, + "grad_norm": 0.721597520938331, + "learning_rate": 6.181918265681221e-07, + "loss": 1.3929, + "step": 3368 + }, + { + "epoch": 0.2347489809427586, + "grad_norm": 0.7270928500070124, + "learning_rate": 6.181454621651149e-07, + "loss": 1.4872, + "step": 3369 + }, + { + "epoch": 0.2348186600703759, + "grad_norm": 0.7859246873980587, + "learning_rate": 6.180990865892984e-07, + "loss": 1.6426, + "step": 3370 + }, + { + "epoch": 0.23488833919799323, + "grad_norm": 0.7658917026133951, + "learning_rate": 6.180526998428953e-07, + "loss": 1.4258, + "step": 3371 + }, + { + "epoch": 0.23495801832561056, + "grad_norm": 0.7703297932331463, + "learning_rate": 6.180063019281282e-07, + "loss": 1.7079, + "step": 3372 + }, + { + "epoch": 0.23502769745322788, + "grad_norm": 0.7604510847538044, + "learning_rate": 6.179598928472208e-07, + "loss": 1.6182, + "step": 3373 + }, + { + "epoch": 0.2350973765808452, + "grad_norm": 0.7085186689342032, + "learning_rate": 6.179134726023971e-07, + "loss": 1.5748, + "step": 3374 + }, + { + "epoch": 0.23516705570846252, + "grad_norm": 0.7427333380583586, + "learning_rate": 6.178670411958817e-07, + "loss": 1.4754, + "step": 3375 + }, + { + "epoch": 0.23523673483607985, + "grad_norm": 0.7173447702200312, + "learning_rate": 6.178205986298996e-07, + "loss": 1.5343, + "step": 3376 + }, + { + "epoch": 0.23530641396369717, + "grad_norm": 0.7043449607322789, + "learning_rate": 6.177741449066763e-07, + "loss": 1.4961, + "step": 3377 + }, + { + "epoch": 0.2353760930913145, + "grad_norm": 0.7456152236319041, + "learning_rate": 6.177276800284382e-07, + "loss": 1.4932, + "step": 3378 + }, + { + "epoch": 0.23544577221893181, + "grad_norm": 0.7424025461195325, + "learning_rate": 6.176812039974119e-07, + "loss": 1.4947, + "step": 3379 + }, + { + "epoch": 0.23551545134654914, + "grad_norm": 0.7232428004855358, + "learning_rate": 6.176347168158246e-07, + "loss": 1.552, + "step": 3380 + }, + { + "epoch": 0.23558513047416646, + "grad_norm": 0.6779956331594822, + "learning_rate": 6.175882184859041e-07, + "loss": 1.4757, + "step": 3381 + }, + { + "epoch": 0.23565480960178378, + "grad_norm": 0.7171144453890865, + "learning_rate": 6.175417090098787e-07, + "loss": 1.5146, + "step": 3382 + }, + { + "epoch": 0.2357244887294011, + "grad_norm": 0.7121967513830976, + "learning_rate": 6.174951883899771e-07, + "loss": 1.5433, + "step": 3383 + }, + { + "epoch": 0.23579416785701843, + "grad_norm": 0.6891281985086806, + "learning_rate": 6.17448656628429e-07, + "loss": 1.6011, + "step": 3384 + }, + { + "epoch": 0.23586384698463575, + "grad_norm": 0.6951321103009805, + "learning_rate": 6.174021137274638e-07, + "loss": 1.486, + "step": 3385 + }, + { + "epoch": 0.23593352611225307, + "grad_norm": 0.7260711549908915, + "learning_rate": 6.173555596893123e-07, + "loss": 1.4865, + "step": 3386 + }, + { + "epoch": 0.2360032052398704, + "grad_norm": 0.6964511042381395, + "learning_rate": 6.173089945162053e-07, + "loss": 1.5472, + "step": 3387 + }, + { + "epoch": 0.23607288436748772, + "grad_norm": 0.7433377677254371, + "learning_rate": 6.172624182103744e-07, + "loss": 1.5743, + "step": 3388 + }, + { + "epoch": 0.23614256349510504, + "grad_norm": 0.7242210025791754, + "learning_rate": 6.172158307740517e-07, + "loss": 1.5983, + "step": 3389 + }, + { + "epoch": 0.23621224262272236, + "grad_norm": 0.7789543820644682, + "learning_rate": 6.171692322094696e-07, + "loss": 1.4545, + "step": 3390 + }, + { + "epoch": 0.23628192175033969, + "grad_norm": 0.751532733815692, + "learning_rate": 6.171226225188612e-07, + "loss": 1.5104, + "step": 3391 + }, + { + "epoch": 0.236351600877957, + "grad_norm": 0.7256233715460338, + "learning_rate": 6.170760017044602e-07, + "loss": 1.5217, + "step": 3392 + }, + { + "epoch": 0.23642128000557433, + "grad_norm": 0.695169404445898, + "learning_rate": 6.170293697685008e-07, + "loss": 1.4637, + "step": 3393 + }, + { + "epoch": 0.23649095913319165, + "grad_norm": 0.7268883932193495, + "learning_rate": 6.169827267132177e-07, + "loss": 1.5521, + "step": 3394 + }, + { + "epoch": 0.23656063826080898, + "grad_norm": 0.8191933964958839, + "learning_rate": 6.169360725408461e-07, + "loss": 1.5085, + "step": 3395 + }, + { + "epoch": 0.2366303173884263, + "grad_norm": 0.7463896155748065, + "learning_rate": 6.168894072536215e-07, + "loss": 1.631, + "step": 3396 + }, + { + "epoch": 0.23669999651604362, + "grad_norm": 0.7538657089938234, + "learning_rate": 6.168427308537807e-07, + "loss": 1.6949, + "step": 3397 + }, + { + "epoch": 0.23676967564366094, + "grad_norm": 0.6922451358544848, + "learning_rate": 6.167960433435602e-07, + "loss": 1.5991, + "step": 3398 + }, + { + "epoch": 0.23683935477127827, + "grad_norm": 0.7173957121519011, + "learning_rate": 6.167493447251974e-07, + "loss": 1.5608, + "step": 3399 + }, + { + "epoch": 0.2369090338988956, + "grad_norm": 0.6702020251445877, + "learning_rate": 6.167026350009302e-07, + "loss": 1.5437, + "step": 3400 + }, + { + "epoch": 0.2369787130265129, + "grad_norm": 0.8055652122242971, + "learning_rate": 6.166559141729971e-07, + "loss": 1.506, + "step": 3401 + }, + { + "epoch": 0.23704839215413023, + "grad_norm": 0.7832627703406165, + "learning_rate": 6.166091822436371e-07, + "loss": 1.6436, + "step": 3402 + }, + { + "epoch": 0.23711807128174756, + "grad_norm": 0.6791136166363546, + "learning_rate": 6.165624392150895e-07, + "loss": 1.4786, + "step": 3403 + }, + { + "epoch": 0.23718775040936488, + "grad_norm": 0.772012569377695, + "learning_rate": 6.165156850895944e-07, + "loss": 1.56, + "step": 3404 + }, + { + "epoch": 0.2372574295369822, + "grad_norm": 0.7148878873649895, + "learning_rate": 6.164689198693925e-07, + "loss": 1.5209, + "step": 3405 + }, + { + "epoch": 0.23732710866459952, + "grad_norm": 0.7661887521009225, + "learning_rate": 6.164221435567247e-07, + "loss": 1.4896, + "step": 3406 + }, + { + "epoch": 0.23739678779221685, + "grad_norm": 0.8144245815031177, + "learning_rate": 6.163753561538325e-07, + "loss": 1.3957, + "step": 3407 + }, + { + "epoch": 0.23746646691983417, + "grad_norm": 0.7057628127124091, + "learning_rate": 6.163285576629585e-07, + "loss": 1.5469, + "step": 3408 + }, + { + "epoch": 0.2375361460474515, + "grad_norm": 0.7065635665133951, + "learning_rate": 6.16281748086345e-07, + "loss": 1.6138, + "step": 3409 + }, + { + "epoch": 0.2376058251750688, + "grad_norm": 0.7702791357963328, + "learning_rate": 6.162349274262353e-07, + "loss": 1.6062, + "step": 3410 + }, + { + "epoch": 0.23767550430268614, + "grad_norm": 0.7129163308798244, + "learning_rate": 6.161880956848732e-07, + "loss": 1.5728, + "step": 3411 + }, + { + "epoch": 0.23774518343030346, + "grad_norm": 0.7367937664828406, + "learning_rate": 6.16141252864503e-07, + "loss": 1.5456, + "step": 3412 + }, + { + "epoch": 0.23781486255792078, + "grad_norm": 0.7393353115134411, + "learning_rate": 6.160943989673692e-07, + "loss": 1.5298, + "step": 3413 + }, + { + "epoch": 0.2378845416855381, + "grad_norm": 0.7583212519478039, + "learning_rate": 6.160475339957176e-07, + "loss": 1.543, + "step": 3414 + }, + { + "epoch": 0.23795422081315543, + "grad_norm": 0.7226687449053614, + "learning_rate": 6.160006579517937e-07, + "loss": 1.5804, + "step": 3415 + }, + { + "epoch": 0.23802389994077275, + "grad_norm": 0.7184778025421876, + "learning_rate": 6.159537708378441e-07, + "loss": 1.5452, + "step": 3416 + }, + { + "epoch": 0.23809357906839007, + "grad_norm": 0.7181375940406853, + "learning_rate": 6.159068726561158e-07, + "loss": 1.418, + "step": 3417 + }, + { + "epoch": 0.2381632581960074, + "grad_norm": 0.7104117242390996, + "learning_rate": 6.158599634088559e-07, + "loss": 1.538, + "step": 3418 + }, + { + "epoch": 0.23823293732362472, + "grad_norm": 0.7020110087325926, + "learning_rate": 6.158130430983127e-07, + "loss": 1.516, + "step": 3419 + }, + { + "epoch": 0.23830261645124204, + "grad_norm": 0.705561198667519, + "learning_rate": 6.157661117267347e-07, + "loss": 1.6158, + "step": 3420 + }, + { + "epoch": 0.23837229557885936, + "grad_norm": 0.6897473127244094, + "learning_rate": 6.157191692963706e-07, + "loss": 1.5299, + "step": 3421 + }, + { + "epoch": 0.23844197470647668, + "grad_norm": 0.7142173166774628, + "learning_rate": 6.156722158094705e-07, + "loss": 1.6315, + "step": 3422 + }, + { + "epoch": 0.238511653834094, + "grad_norm": 0.7311353735621938, + "learning_rate": 6.156252512682842e-07, + "loss": 1.5065, + "step": 3423 + }, + { + "epoch": 0.23858133296171133, + "grad_norm": 0.7386698453917857, + "learning_rate": 6.155782756750624e-07, + "loss": 1.5852, + "step": 3424 + }, + { + "epoch": 0.23865101208932865, + "grad_norm": 0.7174450568898096, + "learning_rate": 6.155312890320563e-07, + "loss": 1.6653, + "step": 3425 + }, + { + "epoch": 0.23872069121694597, + "grad_norm": 0.6757530045972093, + "learning_rate": 6.154842913415175e-07, + "loss": 1.519, + "step": 3426 + }, + { + "epoch": 0.2387903703445633, + "grad_norm": 0.7123687389973483, + "learning_rate": 6.154372826056983e-07, + "loss": 1.5688, + "step": 3427 + }, + { + "epoch": 0.23886004947218062, + "grad_norm": 0.731046582437802, + "learning_rate": 6.153902628268514e-07, + "loss": 1.6112, + "step": 3428 + }, + { + "epoch": 0.23892972859979794, + "grad_norm": 0.6917579901963694, + "learning_rate": 6.153432320072301e-07, + "loss": 1.5759, + "step": 3429 + }, + { + "epoch": 0.23899940772741526, + "grad_norm": 0.6981507803157918, + "learning_rate": 6.152961901490884e-07, + "loss": 1.6106, + "step": 3430 + }, + { + "epoch": 0.2390690868550326, + "grad_norm": 0.7125774274588189, + "learning_rate": 6.152491372546804e-07, + "loss": 1.5118, + "step": 3431 + }, + { + "epoch": 0.2391387659826499, + "grad_norm": 0.7006860564252603, + "learning_rate": 6.15202073326261e-07, + "loss": 1.4689, + "step": 3432 + }, + { + "epoch": 0.23920844511026723, + "grad_norm": 0.7095641180879917, + "learning_rate": 6.151549983660856e-07, + "loss": 1.4697, + "step": 3433 + }, + { + "epoch": 0.23927812423788455, + "grad_norm": 0.7259126769570512, + "learning_rate": 6.151079123764104e-07, + "loss": 1.5463, + "step": 3434 + }, + { + "epoch": 0.23934780336550188, + "grad_norm": 0.8038698655154628, + "learning_rate": 6.150608153594915e-07, + "loss": 1.5724, + "step": 3435 + }, + { + "epoch": 0.2394174824931192, + "grad_norm": 0.7405163393205098, + "learning_rate": 6.150137073175859e-07, + "loss": 1.5711, + "step": 3436 + }, + { + "epoch": 0.23948716162073652, + "grad_norm": 0.697010624031155, + "learning_rate": 6.149665882529513e-07, + "loss": 1.4704, + "step": 3437 + }, + { + "epoch": 0.23955684074835384, + "grad_norm": 0.7037022988926063, + "learning_rate": 6.149194581678457e-07, + "loss": 1.5133, + "step": 3438 + }, + { + "epoch": 0.23962651987597114, + "grad_norm": 0.7656938726176664, + "learning_rate": 6.148723170645277e-07, + "loss": 1.5413, + "step": 3439 + }, + { + "epoch": 0.23969619900358846, + "grad_norm": 0.7258567001381848, + "learning_rate": 6.148251649452564e-07, + "loss": 1.4647, + "step": 3440 + }, + { + "epoch": 0.23976587813120578, + "grad_norm": 0.721650592199456, + "learning_rate": 6.147780018122912e-07, + "loss": 1.5456, + "step": 3441 + }, + { + "epoch": 0.2398355572588231, + "grad_norm": 0.7469447720018643, + "learning_rate": 6.147308276678926e-07, + "loss": 1.6895, + "step": 3442 + }, + { + "epoch": 0.23990523638644043, + "grad_norm": 0.6885474505548068, + "learning_rate": 6.14683642514321e-07, + "loss": 1.5356, + "step": 3443 + }, + { + "epoch": 0.23997491551405775, + "grad_norm": 0.7302283015243304, + "learning_rate": 6.146364463538377e-07, + "loss": 1.6192, + "step": 3444 + }, + { + "epoch": 0.24004459464167507, + "grad_norm": 0.7402686792467148, + "learning_rate": 6.145892391887046e-07, + "loss": 1.6829, + "step": 3445 + }, + { + "epoch": 0.2401142737692924, + "grad_norm": 0.752528153723735, + "learning_rate": 6.145420210211837e-07, + "loss": 1.5519, + "step": 3446 + }, + { + "epoch": 0.24018395289690972, + "grad_norm": 0.6629770876275467, + "learning_rate": 6.144947918535379e-07, + "loss": 1.4805, + "step": 3447 + }, + { + "epoch": 0.24025363202452704, + "grad_norm": 0.699902908697308, + "learning_rate": 6.144475516880307e-07, + "loss": 1.4926, + "step": 3448 + }, + { + "epoch": 0.24032331115214436, + "grad_norm": 0.7410783032423637, + "learning_rate": 6.144003005269256e-07, + "loss": 1.4714, + "step": 3449 + }, + { + "epoch": 0.2403929902797617, + "grad_norm": 0.7367406097734193, + "learning_rate": 6.143530383724872e-07, + "loss": 1.5405, + "step": 3450 + }, + { + "epoch": 0.240462669407379, + "grad_norm": 0.771099676513623, + "learning_rate": 6.143057652269803e-07, + "loss": 1.5602, + "step": 3451 + }, + { + "epoch": 0.24053234853499633, + "grad_norm": 0.7071012761640494, + "learning_rate": 6.142584810926704e-07, + "loss": 1.5461, + "step": 3452 + }, + { + "epoch": 0.24060202766261365, + "grad_norm": 0.7170107410005159, + "learning_rate": 6.142111859718235e-07, + "loss": 1.4895, + "step": 3453 + }, + { + "epoch": 0.24067170679023098, + "grad_norm": 0.7104958509461615, + "learning_rate": 6.141638798667058e-07, + "loss": 1.4503, + "step": 3454 + }, + { + "epoch": 0.2407413859178483, + "grad_norm": 0.8801778611348274, + "learning_rate": 6.141165627795848e-07, + "loss": 1.6851, + "step": 3455 + }, + { + "epoch": 0.24081106504546562, + "grad_norm": 0.7737450201788408, + "learning_rate": 6.140692347127276e-07, + "loss": 1.4547, + "step": 3456 + }, + { + "epoch": 0.24088074417308294, + "grad_norm": 0.7241405832766921, + "learning_rate": 6.140218956684024e-07, + "loss": 1.6129, + "step": 3457 + }, + { + "epoch": 0.24095042330070027, + "grad_norm": 0.7633311415281308, + "learning_rate": 6.139745456488778e-07, + "loss": 1.6202, + "step": 3458 + }, + { + "epoch": 0.2410201024283176, + "grad_norm": 0.7854016040714885, + "learning_rate": 6.139271846564229e-07, + "loss": 1.7454, + "step": 3459 + }, + { + "epoch": 0.2410897815559349, + "grad_norm": 0.7161975046816076, + "learning_rate": 6.138798126933074e-07, + "loss": 1.6831, + "step": 3460 + }, + { + "epoch": 0.24115946068355223, + "grad_norm": 0.6605931548102567, + "learning_rate": 6.138324297618012e-07, + "loss": 1.5774, + "step": 3461 + }, + { + "epoch": 0.24122913981116956, + "grad_norm": 0.6311640853920627, + "learning_rate": 6.137850358641754e-07, + "loss": 1.4281, + "step": 3462 + }, + { + "epoch": 0.24129881893878688, + "grad_norm": 0.7140109842514096, + "learning_rate": 6.137376310027008e-07, + "loss": 1.5915, + "step": 3463 + }, + { + "epoch": 0.2413684980664042, + "grad_norm": 0.7039807459381068, + "learning_rate": 6.136902151796495e-07, + "loss": 1.5709, + "step": 3464 + }, + { + "epoch": 0.24143817719402152, + "grad_norm": 0.6721860425881251, + "learning_rate": 6.136427883972935e-07, + "loss": 1.5871, + "step": 3465 + }, + { + "epoch": 0.24150785632163885, + "grad_norm": 0.7615669920967632, + "learning_rate": 6.135953506579057e-07, + "loss": 1.5575, + "step": 3466 + }, + { + "epoch": 0.24157753544925617, + "grad_norm": 0.6938093239176405, + "learning_rate": 6.135479019637593e-07, + "loss": 1.5081, + "step": 3467 + }, + { + "epoch": 0.2416472145768735, + "grad_norm": 0.6998087722630385, + "learning_rate": 6.135004423171284e-07, + "loss": 1.475, + "step": 3468 + }, + { + "epoch": 0.24171689370449082, + "grad_norm": 0.7284799755105115, + "learning_rate": 6.134529717202873e-07, + "loss": 1.5256, + "step": 3469 + }, + { + "epoch": 0.24178657283210814, + "grad_norm": 0.7339876042097843, + "learning_rate": 6.134054901755106e-07, + "loss": 1.4791, + "step": 3470 + }, + { + "epoch": 0.24185625195972546, + "grad_norm": 0.7344813016736725, + "learning_rate": 6.133579976850738e-07, + "loss": 1.6821, + "step": 3471 + }, + { + "epoch": 0.24192593108734278, + "grad_norm": 0.8034131484006056, + "learning_rate": 6.133104942512532e-07, + "loss": 1.5536, + "step": 3472 + }, + { + "epoch": 0.2419956102149601, + "grad_norm": 0.7091929112333546, + "learning_rate": 6.132629798763249e-07, + "loss": 1.5343, + "step": 3473 + }, + { + "epoch": 0.24206528934257743, + "grad_norm": 0.7372457302786409, + "learning_rate": 6.13215454562566e-07, + "loss": 1.5681, + "step": 3474 + }, + { + "epoch": 0.24213496847019475, + "grad_norm": 0.7163122501788207, + "learning_rate": 6.131679183122539e-07, + "loss": 1.5721, + "step": 3475 + }, + { + "epoch": 0.24220464759781207, + "grad_norm": 0.7644696332870136, + "learning_rate": 6.131203711276669e-07, + "loss": 1.5467, + "step": 3476 + }, + { + "epoch": 0.2422743267254294, + "grad_norm": 0.659718083288887, + "learning_rate": 6.130728130110833e-07, + "loss": 1.5017, + "step": 3477 + }, + { + "epoch": 0.24234400585304672, + "grad_norm": 0.6883044205305839, + "learning_rate": 6.130252439647823e-07, + "loss": 1.5113, + "step": 3478 + }, + { + "epoch": 0.24241368498066404, + "grad_norm": 0.7201350430252552, + "learning_rate": 6.129776639910434e-07, + "loss": 1.5899, + "step": 3479 + }, + { + "epoch": 0.24248336410828136, + "grad_norm": 0.7132112424660706, + "learning_rate": 6.129300730921468e-07, + "loss": 1.4527, + "step": 3480 + }, + { + "epoch": 0.24255304323589869, + "grad_norm": 0.7118297627685076, + "learning_rate": 6.128824712703734e-07, + "loss": 1.6241, + "step": 3481 + }, + { + "epoch": 0.242622722363516, + "grad_norm": 0.7700354394842043, + "learning_rate": 6.128348585280039e-07, + "loss": 1.5645, + "step": 3482 + }, + { + "epoch": 0.24269240149113333, + "grad_norm": 0.7532057834209641, + "learning_rate": 6.127872348673204e-07, + "loss": 1.5646, + "step": 3483 + }, + { + "epoch": 0.24276208061875065, + "grad_norm": 0.7249802886382719, + "learning_rate": 6.127396002906049e-07, + "loss": 1.4486, + "step": 3484 + }, + { + "epoch": 0.24283175974636798, + "grad_norm": 0.7016431329438683, + "learning_rate": 6.126919548001403e-07, + "loss": 1.6213, + "step": 3485 + }, + { + "epoch": 0.2429014388739853, + "grad_norm": 0.7030641444243805, + "learning_rate": 6.126442983982096e-07, + "loss": 1.5588, + "step": 3486 + }, + { + "epoch": 0.24297111800160262, + "grad_norm": 0.7413791208700851, + "learning_rate": 6.125966310870968e-07, + "loss": 1.6794, + "step": 3487 + }, + { + "epoch": 0.24304079712921994, + "grad_norm": 0.6779401090861621, + "learning_rate": 6.125489528690863e-07, + "loss": 1.6256, + "step": 3488 + }, + { + "epoch": 0.24311047625683727, + "grad_norm": 0.6852961562158302, + "learning_rate": 6.125012637464628e-07, + "loss": 1.578, + "step": 3489 + }, + { + "epoch": 0.2431801553844546, + "grad_norm": 0.7876391800882989, + "learning_rate": 6.124535637215116e-07, + "loss": 1.6865, + "step": 3490 + }, + { + "epoch": 0.2432498345120719, + "grad_norm": 0.7089052160927828, + "learning_rate": 6.124058527965189e-07, + "loss": 1.4728, + "step": 3491 + }, + { + "epoch": 0.24331951363968923, + "grad_norm": 0.6710252479864719, + "learning_rate": 6.123581309737707e-07, + "loss": 1.5328, + "step": 3492 + }, + { + "epoch": 0.24338919276730656, + "grad_norm": 7.417390349939993, + "learning_rate": 6.12310398255554e-07, + "loss": 1.468, + "step": 3493 + }, + { + "epoch": 0.24345887189492388, + "grad_norm": 0.6818307687602854, + "learning_rate": 6.122626546441567e-07, + "loss": 1.6087, + "step": 3494 + }, + { + "epoch": 0.2435285510225412, + "grad_norm": 0.7113519070432572, + "learning_rate": 6.122149001418661e-07, + "loss": 1.5702, + "step": 3495 + }, + { + "epoch": 0.24359823015015852, + "grad_norm": 0.7331933417391607, + "learning_rate": 6.121671347509712e-07, + "loss": 1.6366, + "step": 3496 + }, + { + "epoch": 0.24366790927777585, + "grad_norm": 0.7353912377560792, + "learning_rate": 6.121193584737607e-07, + "loss": 1.4549, + "step": 3497 + }, + { + "epoch": 0.24373758840539317, + "grad_norm": 0.747218662762241, + "learning_rate": 6.120715713125245e-07, + "loss": 1.6776, + "step": 3498 + }, + { + "epoch": 0.2438072675330105, + "grad_norm": 0.7783135396187291, + "learning_rate": 6.120237732695521e-07, + "loss": 1.5549, + "step": 3499 + }, + { + "epoch": 0.2438769466606278, + "grad_norm": 0.7866557794969244, + "learning_rate": 6.119759643471347e-07, + "loss": 1.3877, + "step": 3500 + }, + { + "epoch": 0.24394662578824514, + "grad_norm": 0.7104409997433944, + "learning_rate": 6.11928144547563e-07, + "loss": 1.5761, + "step": 3501 + }, + { + "epoch": 0.24401630491586246, + "grad_norm": 0.7434088447922035, + "learning_rate": 6.118803138731287e-07, + "loss": 1.513, + "step": 3502 + }, + { + "epoch": 0.24408598404347978, + "grad_norm": 0.719581279474016, + "learning_rate": 6.118324723261241e-07, + "loss": 1.4972, + "step": 3503 + }, + { + "epoch": 0.2441556631710971, + "grad_norm": 0.643472580546137, + "learning_rate": 6.117846199088417e-07, + "loss": 1.4641, + "step": 3504 + }, + { + "epoch": 0.24422534229871443, + "grad_norm": 0.7445412858204775, + "learning_rate": 6.117367566235748e-07, + "loss": 1.5576, + "step": 3505 + }, + { + "epoch": 0.24429502142633175, + "grad_norm": 0.7239385537409243, + "learning_rate": 6.11688882472617e-07, + "loss": 1.5952, + "step": 3506 + }, + { + "epoch": 0.24436470055394907, + "grad_norm": 0.7764917647022519, + "learning_rate": 6.116409974582625e-07, + "loss": 1.6206, + "step": 3507 + }, + { + "epoch": 0.2444343796815664, + "grad_norm": 0.74202044266512, + "learning_rate": 6.115931015828062e-07, + "loss": 1.5645, + "step": 3508 + }, + { + "epoch": 0.24450405880918372, + "grad_norm": 0.6935767282043237, + "learning_rate": 6.115451948485431e-07, + "loss": 1.5034, + "step": 3509 + }, + { + "epoch": 0.24457373793680104, + "grad_norm": 0.7039628910097253, + "learning_rate": 6.114972772577693e-07, + "loss": 1.5239, + "step": 3510 + }, + { + "epoch": 0.24464341706441836, + "grad_norm": 0.6874285293399682, + "learning_rate": 6.11449348812781e-07, + "loss": 1.6456, + "step": 3511 + }, + { + "epoch": 0.24471309619203568, + "grad_norm": 0.6522379381976608, + "learning_rate": 6.11401409515875e-07, + "loss": 1.5646, + "step": 3512 + }, + { + "epoch": 0.244782775319653, + "grad_norm": 0.8171749552425288, + "learning_rate": 6.113534593693486e-07, + "loss": 1.5602, + "step": 3513 + }, + { + "epoch": 0.24485245444727033, + "grad_norm": 0.7396199021021814, + "learning_rate": 6.113054983754999e-07, + "loss": 1.5303, + "step": 3514 + }, + { + "epoch": 0.24492213357488765, + "grad_norm": 0.7662564325697268, + "learning_rate": 6.11257526536627e-07, + "loss": 1.5687, + "step": 3515 + }, + { + "epoch": 0.24499181270250497, + "grad_norm": 0.6908179397521864, + "learning_rate": 6.11209543855029e-07, + "loss": 1.5637, + "step": 3516 + }, + { + "epoch": 0.2450614918301223, + "grad_norm": 0.6836038980227871, + "learning_rate": 6.111615503330051e-07, + "loss": 1.5156, + "step": 3517 + }, + { + "epoch": 0.24513117095773962, + "grad_norm": 0.73581436271446, + "learning_rate": 6.111135459728556e-07, + "loss": 1.6565, + "step": 3518 + }, + { + "epoch": 0.24520085008535694, + "grad_norm": 0.7405195856434742, + "learning_rate": 6.110655307768808e-07, + "loss": 1.4523, + "step": 3519 + }, + { + "epoch": 0.24527052921297426, + "grad_norm": 0.7585536397564181, + "learning_rate": 6.110175047473816e-07, + "loss": 1.3932, + "step": 3520 + }, + { + "epoch": 0.2453402083405916, + "grad_norm": 0.709276744464445, + "learning_rate": 6.109694678866594e-07, + "loss": 1.4675, + "step": 3521 + }, + { + "epoch": 0.2454098874682089, + "grad_norm": 0.750704891947952, + "learning_rate": 6.109214201970165e-07, + "loss": 1.4458, + "step": 3522 + }, + { + "epoch": 0.24547956659582623, + "grad_norm": 0.772678393930129, + "learning_rate": 6.108733616807554e-07, + "loss": 1.6841, + "step": 3523 + }, + { + "epoch": 0.24554924572344355, + "grad_norm": 0.6817202500029117, + "learning_rate": 6.10825292340179e-07, + "loss": 1.5587, + "step": 3524 + }, + { + "epoch": 0.24561892485106088, + "grad_norm": 0.8040107414482522, + "learning_rate": 6.10777212177591e-07, + "loss": 1.5774, + "step": 3525 + }, + { + "epoch": 0.2456886039786782, + "grad_norm": 1.796804413019553, + "learning_rate": 6.107291211952956e-07, + "loss": 1.4213, + "step": 3526 + }, + { + "epoch": 0.24575828310629552, + "grad_norm": 0.7693183737224776, + "learning_rate": 6.106810193955972e-07, + "loss": 1.6151, + "step": 3527 + }, + { + "epoch": 0.24582796223391284, + "grad_norm": 0.7251457431661789, + "learning_rate": 6.10632906780801e-07, + "loss": 1.5434, + "step": 3528 + }, + { + "epoch": 0.24589764136153017, + "grad_norm": 0.7261229116118169, + "learning_rate": 6.105847833532127e-07, + "loss": 1.4882, + "step": 3529 + }, + { + "epoch": 0.2459673204891475, + "grad_norm": 0.7457010223359175, + "learning_rate": 6.105366491151387e-07, + "loss": 1.7156, + "step": 3530 + }, + { + "epoch": 0.24603699961676478, + "grad_norm": 0.7071322583151507, + "learning_rate": 6.104885040688851e-07, + "loss": 1.5663, + "step": 3531 + }, + { + "epoch": 0.2461066787443821, + "grad_norm": 0.6879911743671988, + "learning_rate": 6.104403482167596e-07, + "loss": 1.5783, + "step": 3532 + }, + { + "epoch": 0.24617635787199943, + "grad_norm": 0.744748622425573, + "learning_rate": 6.103921815610699e-07, + "loss": 1.6289, + "step": 3533 + }, + { + "epoch": 0.24624603699961675, + "grad_norm": 0.691139277317366, + "learning_rate": 6.10344004104124e-07, + "loss": 1.4919, + "step": 3534 + }, + { + "epoch": 0.24631571612723407, + "grad_norm": 0.6674381484094699, + "learning_rate": 6.102958158482309e-07, + "loss": 1.344, + "step": 3535 + }, + { + "epoch": 0.2463853952548514, + "grad_norm": 0.7468365533757317, + "learning_rate": 6.102476167956997e-07, + "loss": 1.5975, + "step": 3536 + }, + { + "epoch": 0.24645507438246872, + "grad_norm": 0.7617737698264097, + "learning_rate": 6.101994069488403e-07, + "loss": 1.6178, + "step": 3537 + }, + { + "epoch": 0.24652475351008604, + "grad_norm": 0.6924214940752028, + "learning_rate": 6.10151186309963e-07, + "loss": 1.5002, + "step": 3538 + }, + { + "epoch": 0.24659443263770336, + "grad_norm": 0.7349604377014846, + "learning_rate": 6.101029548813787e-07, + "loss": 1.6046, + "step": 3539 + }, + { + "epoch": 0.2466641117653207, + "grad_norm": 0.7471521985422922, + "learning_rate": 6.100547126653986e-07, + "loss": 1.5667, + "step": 3540 + }, + { + "epoch": 0.246733790892938, + "grad_norm": 0.7367598222463619, + "learning_rate": 6.100064596643346e-07, + "loss": 1.6448, + "step": 3541 + }, + { + "epoch": 0.24680347002055533, + "grad_norm": 0.7776711532694474, + "learning_rate": 6.099581958804993e-07, + "loss": 1.4532, + "step": 3542 + }, + { + "epoch": 0.24687314914817265, + "grad_norm": 0.7806167018788672, + "learning_rate": 6.099099213162053e-07, + "loss": 1.638, + "step": 3543 + }, + { + "epoch": 0.24694282827578998, + "grad_norm": 0.7661398733958934, + "learning_rate": 6.098616359737661e-07, + "loss": 1.6671, + "step": 3544 + }, + { + "epoch": 0.2470125074034073, + "grad_norm": 0.7234253141818819, + "learning_rate": 6.098133398554956e-07, + "loss": 1.524, + "step": 3545 + }, + { + "epoch": 0.24708218653102462, + "grad_norm": 0.7359219209086516, + "learning_rate": 6.097650329637085e-07, + "loss": 1.6014, + "step": 3546 + }, + { + "epoch": 0.24715186565864194, + "grad_norm": 0.681303466428281, + "learning_rate": 6.097167153007195e-07, + "loss": 1.6173, + "step": 3547 + }, + { + "epoch": 0.24722154478625927, + "grad_norm": 0.6865820922181248, + "learning_rate": 6.096683868688443e-07, + "loss": 1.5139, + "step": 3548 + }, + { + "epoch": 0.2472912239138766, + "grad_norm": 0.6920372454919099, + "learning_rate": 6.096200476703986e-07, + "loss": 1.3954, + "step": 3549 + }, + { + "epoch": 0.2473609030414939, + "grad_norm": 0.7476041916829527, + "learning_rate": 6.095716977076992e-07, + "loss": 1.3092, + "step": 3550 + }, + { + "epoch": 0.24743058216911124, + "grad_norm": 0.8072680023419683, + "learning_rate": 6.095233369830628e-07, + "loss": 1.5462, + "step": 3551 + }, + { + "epoch": 0.24750026129672856, + "grad_norm": 0.8213723342959321, + "learning_rate": 6.094749654988073e-07, + "loss": 1.7721, + "step": 3552 + }, + { + "epoch": 0.24756994042434588, + "grad_norm": 0.6873792435636162, + "learning_rate": 6.094265832572506e-07, + "loss": 1.5889, + "step": 3553 + }, + { + "epoch": 0.2476396195519632, + "grad_norm": 0.7316708923206956, + "learning_rate": 6.093781902607114e-07, + "loss": 1.523, + "step": 3554 + }, + { + "epoch": 0.24770929867958053, + "grad_norm": 0.7469485247831148, + "learning_rate": 6.093297865115086e-07, + "loss": 1.6226, + "step": 3555 + }, + { + "epoch": 0.24777897780719785, + "grad_norm": 0.6725940287268622, + "learning_rate": 6.092813720119618e-07, + "loss": 1.435, + "step": 3556 + }, + { + "epoch": 0.24784865693481517, + "grad_norm": 0.6954422705334324, + "learning_rate": 6.092329467643914e-07, + "loss": 1.4995, + "step": 3557 + }, + { + "epoch": 0.2479183360624325, + "grad_norm": 0.7117543686613589, + "learning_rate": 6.091845107711177e-07, + "loss": 1.384, + "step": 3558 + }, + { + "epoch": 0.24798801519004982, + "grad_norm": 0.7432403113648027, + "learning_rate": 6.091360640344619e-07, + "loss": 1.6325, + "step": 3559 + }, + { + "epoch": 0.24805769431766714, + "grad_norm": 0.7451833768308808, + "learning_rate": 6.09087606556746e-07, + "loss": 1.6012, + "step": 3560 + }, + { + "epoch": 0.24812737344528446, + "grad_norm": 1.4392707418778636, + "learning_rate": 6.090391383402919e-07, + "loss": 1.5517, + "step": 3561 + }, + { + "epoch": 0.24819705257290178, + "grad_norm": 0.726831617640144, + "learning_rate": 6.089906593874222e-07, + "loss": 1.492, + "step": 3562 + }, + { + "epoch": 0.2482667317005191, + "grad_norm": 0.768420711031291, + "learning_rate": 6.089421697004604e-07, + "loss": 1.6141, + "step": 3563 + }, + { + "epoch": 0.24833641082813643, + "grad_norm": 0.7361593118586319, + "learning_rate": 6.088936692817301e-07, + "loss": 1.7136, + "step": 3564 + }, + { + "epoch": 0.24840608995575375, + "grad_norm": 0.7033722393985278, + "learning_rate": 6.088451581335555e-07, + "loss": 1.4913, + "step": 3565 + }, + { + "epoch": 0.24847576908337107, + "grad_norm": 0.7232376051382627, + "learning_rate": 6.087966362582614e-07, + "loss": 1.5684, + "step": 3566 + }, + { + "epoch": 0.2485454482109884, + "grad_norm": 0.7350353882547026, + "learning_rate": 6.087481036581729e-07, + "loss": 1.54, + "step": 3567 + }, + { + "epoch": 0.24861512733860572, + "grad_norm": 0.7768085786334259, + "learning_rate": 6.08699560335616e-07, + "loss": 1.6108, + "step": 3568 + }, + { + "epoch": 0.24868480646622304, + "grad_norm": 0.7023493785204465, + "learning_rate": 6.08651006292917e-07, + "loss": 1.4761, + "step": 3569 + }, + { + "epoch": 0.24875448559384036, + "grad_norm": 0.699437447824012, + "learning_rate": 6.086024415324025e-07, + "loss": 1.5039, + "step": 3570 + }, + { + "epoch": 0.24882416472145769, + "grad_norm": 0.7063756052529659, + "learning_rate": 6.085538660564001e-07, + "loss": 1.4976, + "step": 3571 + }, + { + "epoch": 0.248893843849075, + "grad_norm": 0.7142139878054212, + "learning_rate": 6.085052798672374e-07, + "loss": 1.3809, + "step": 3572 + }, + { + "epoch": 0.24896352297669233, + "grad_norm": 0.6959967770546999, + "learning_rate": 6.084566829672429e-07, + "loss": 1.5112, + "step": 3573 + }, + { + "epoch": 0.24903320210430965, + "grad_norm": 0.6939186177106823, + "learning_rate": 6.084080753587453e-07, + "loss": 1.5446, + "step": 3574 + }, + { + "epoch": 0.24910288123192698, + "grad_norm": 0.7029487486994859, + "learning_rate": 6.083594570440742e-07, + "loss": 1.4871, + "step": 3575 + }, + { + "epoch": 0.2491725603595443, + "grad_norm": 0.742816701423201, + "learning_rate": 6.083108280255593e-07, + "loss": 1.6708, + "step": 3576 + }, + { + "epoch": 0.24924223948716162, + "grad_norm": 0.7137477992334339, + "learning_rate": 6.08262188305531e-07, + "loss": 1.431, + "step": 3577 + }, + { + "epoch": 0.24931191861477894, + "grad_norm": 0.7036883536953713, + "learning_rate": 6.082135378863204e-07, + "loss": 1.5361, + "step": 3578 + }, + { + "epoch": 0.24938159774239627, + "grad_norm": 0.807427431662475, + "learning_rate": 6.081648767702589e-07, + "loss": 1.6449, + "step": 3579 + }, + { + "epoch": 0.2494512768700136, + "grad_norm": 0.6729778495040805, + "learning_rate": 6.081162049596781e-07, + "loss": 1.5132, + "step": 3580 + }, + { + "epoch": 0.2495209559976309, + "grad_norm": 0.6724953431528643, + "learning_rate": 6.080675224569108e-07, + "loss": 1.4788, + "step": 3581 + }, + { + "epoch": 0.24959063512524823, + "grad_norm": 0.7154576876166014, + "learning_rate": 6.080188292642901e-07, + "loss": 1.5311, + "step": 3582 + }, + { + "epoch": 0.24966031425286556, + "grad_norm": 0.7493444527994712, + "learning_rate": 6.07970125384149e-07, + "loss": 1.5591, + "step": 3583 + }, + { + "epoch": 0.24972999338048288, + "grad_norm": 0.7266179891707369, + "learning_rate": 6.079214108188219e-07, + "loss": 1.5179, + "step": 3584 + }, + { + "epoch": 0.2497996725081002, + "grad_norm": 0.7217565810210871, + "learning_rate": 6.07872685570643e-07, + "loss": 1.5389, + "step": 3585 + }, + { + "epoch": 0.24986935163571752, + "grad_norm": 0.6896161569382101, + "learning_rate": 6.078239496419476e-07, + "loss": 1.5998, + "step": 3586 + }, + { + "epoch": 0.24993903076333485, + "grad_norm": 0.7371361263420952, + "learning_rate": 6.07775203035071e-07, + "loss": 1.5602, + "step": 3587 + }, + { + "epoch": 0.25000870989095214, + "grad_norm": 0.7334616155478288, + "learning_rate": 6.077264457523493e-07, + "loss": 1.5357, + "step": 3588 + }, + { + "epoch": 0.2500783890185695, + "grad_norm": 0.6977451408653291, + "learning_rate": 6.076776777961192e-07, + "loss": 1.7069, + "step": 3589 + }, + { + "epoch": 0.2501480681461868, + "grad_norm": 0.6836085763879641, + "learning_rate": 6.076288991687174e-07, + "loss": 1.5012, + "step": 3590 + }, + { + "epoch": 0.25021774727380414, + "grad_norm": 0.724668401614909, + "learning_rate": 6.075801098724819e-07, + "loss": 1.4189, + "step": 3591 + }, + { + "epoch": 0.25028742640142143, + "grad_norm": 0.7218857976482577, + "learning_rate": 6.075313099097505e-07, + "loss": 1.533, + "step": 3592 + }, + { + "epoch": 0.2503571055290388, + "grad_norm": 0.7454390129326977, + "learning_rate": 6.07482499282862e-07, + "loss": 1.6404, + "step": 3593 + }, + { + "epoch": 0.2504267846566561, + "grad_norm": 0.7276507976619264, + "learning_rate": 6.074336779941551e-07, + "loss": 1.5575, + "step": 3594 + }, + { + "epoch": 0.2504964637842734, + "grad_norm": 0.8181538349091576, + "learning_rate": 6.0738484604597e-07, + "loss": 1.5305, + "step": 3595 + }, + { + "epoch": 0.2505661429118907, + "grad_norm": 0.7022901688141522, + "learning_rate": 6.073360034406465e-07, + "loss": 1.4621, + "step": 3596 + }, + { + "epoch": 0.25063582203950807, + "grad_norm": 0.7053344078146021, + "learning_rate": 6.072871501805251e-07, + "loss": 1.6526, + "step": 3597 + }, + { + "epoch": 0.25070550116712537, + "grad_norm": 0.6812712012110201, + "learning_rate": 6.072382862679472e-07, + "loss": 1.4954, + "step": 3598 + }, + { + "epoch": 0.2507751802947427, + "grad_norm": 0.6865562783180164, + "learning_rate": 6.071894117052545e-07, + "loss": 1.5444, + "step": 3599 + }, + { + "epoch": 0.25084485942236, + "grad_norm": 0.7130584509089323, + "learning_rate": 6.071405264947889e-07, + "loss": 1.5434, + "step": 3600 + }, + { + "epoch": 0.25091453854997736, + "grad_norm": 0.7343554727384859, + "learning_rate": 6.070916306388933e-07, + "loss": 1.4086, + "step": 3601 + }, + { + "epoch": 0.25098421767759466, + "grad_norm": 0.7308227185668241, + "learning_rate": 6.070427241399108e-07, + "loss": 1.533, + "step": 3602 + }, + { + "epoch": 0.251053896805212, + "grad_norm": 0.7054271106104274, + "learning_rate": 6.06993807000185e-07, + "loss": 1.5204, + "step": 3603 + }, + { + "epoch": 0.2511235759328293, + "grad_norm": 0.7238952984007729, + "learning_rate": 6.069448792220603e-07, + "loss": 1.5857, + "step": 3604 + }, + { + "epoch": 0.25119325506044665, + "grad_norm": 0.7619163189051473, + "learning_rate": 6.068959408078813e-07, + "loss": 1.6456, + "step": 3605 + }, + { + "epoch": 0.25126293418806395, + "grad_norm": 0.7691821142914758, + "learning_rate": 6.068469917599934e-07, + "loss": 1.6207, + "step": 3606 + }, + { + "epoch": 0.2513326133156813, + "grad_norm": 0.7844297210575869, + "learning_rate": 6.067980320807421e-07, + "loss": 1.6005, + "step": 3607 + }, + { + "epoch": 0.2514022924432986, + "grad_norm": 0.767312581828994, + "learning_rate": 6.067490617724739e-07, + "loss": 1.5474, + "step": 3608 + }, + { + "epoch": 0.25147197157091594, + "grad_norm": 0.7466302509493756, + "learning_rate": 6.067000808375353e-07, + "loss": 1.5245, + "step": 3609 + }, + { + "epoch": 0.25154165069853324, + "grad_norm": 0.7194713716465715, + "learning_rate": 6.066510892782737e-07, + "loss": 1.5255, + "step": 3610 + }, + { + "epoch": 0.2516113298261506, + "grad_norm": 0.6903691577711621, + "learning_rate": 6.066020870970368e-07, + "loss": 1.5452, + "step": 3611 + }, + { + "epoch": 0.2516810089537679, + "grad_norm": 0.8469633001258798, + "learning_rate": 6.065530742961731e-07, + "loss": 1.5305, + "step": 3612 + }, + { + "epoch": 0.25175068808138523, + "grad_norm": 0.6957640057146053, + "learning_rate": 6.065040508780312e-07, + "loss": 1.4241, + "step": 3613 + }, + { + "epoch": 0.2518203672090025, + "grad_norm": 0.7070417869184299, + "learning_rate": 6.064550168449603e-07, + "loss": 1.5566, + "step": 3614 + }, + { + "epoch": 0.2518900463366199, + "grad_norm": 0.7573782379837182, + "learning_rate": 6.064059721993104e-07, + "loss": 1.5595, + "step": 3615 + }, + { + "epoch": 0.25195972546423717, + "grad_norm": 0.7678449791648668, + "learning_rate": 6.063569169434319e-07, + "loss": 1.5895, + "step": 3616 + }, + { + "epoch": 0.2520294045918545, + "grad_norm": 0.7780040963314879, + "learning_rate": 6.063078510796754e-07, + "loss": 1.5685, + "step": 3617 + }, + { + "epoch": 0.2520990837194718, + "grad_norm": 0.697366601001487, + "learning_rate": 6.062587746103924e-07, + "loss": 1.6299, + "step": 3618 + }, + { + "epoch": 0.25216876284708917, + "grad_norm": 0.6918393842439775, + "learning_rate": 6.062096875379346e-07, + "loss": 1.6065, + "step": 3619 + }, + { + "epoch": 0.25223844197470646, + "grad_norm": 0.7765055919439487, + "learning_rate": 6.061605898646545e-07, + "loss": 1.6526, + "step": 3620 + }, + { + "epoch": 0.2523081211023238, + "grad_norm": 0.7472761617400331, + "learning_rate": 6.06111481592905e-07, + "loss": 1.542, + "step": 3621 + }, + { + "epoch": 0.2523778002299411, + "grad_norm": 0.7486342805260572, + "learning_rate": 6.060623627250391e-07, + "loss": 1.6035, + "step": 3622 + }, + { + "epoch": 0.25244747935755846, + "grad_norm": 0.7500589843074995, + "learning_rate": 6.060132332634111e-07, + "loss": 1.5478, + "step": 3623 + }, + { + "epoch": 0.25251715848517575, + "grad_norm": 0.7331824331258279, + "learning_rate": 6.059640932103753e-07, + "loss": 1.4816, + "step": 3624 + }, + { + "epoch": 0.2525868376127931, + "grad_norm": 0.6785392683089051, + "learning_rate": 6.059149425682865e-07, + "loss": 1.4475, + "step": 3625 + }, + { + "epoch": 0.2526565167404104, + "grad_norm": 0.7152394572979834, + "learning_rate": 6.058657813395e-07, + "loss": 1.6863, + "step": 3626 + }, + { + "epoch": 0.25272619586802775, + "grad_norm": 0.7231210956234625, + "learning_rate": 6.058166095263719e-07, + "loss": 1.583, + "step": 3627 + }, + { + "epoch": 0.25279587499564504, + "grad_norm": 0.7245911280542969, + "learning_rate": 6.057674271312585e-07, + "loss": 1.64, + "step": 3628 + }, + { + "epoch": 0.2528655541232624, + "grad_norm": 0.7097081774448422, + "learning_rate": 6.057182341565169e-07, + "loss": 1.5683, + "step": 3629 + }, + { + "epoch": 0.2529352332508797, + "grad_norm": 0.7191293124979857, + "learning_rate": 6.056690306045043e-07, + "loss": 1.5606, + "step": 3630 + }, + { + "epoch": 0.25300491237849704, + "grad_norm": 0.7614871453223097, + "learning_rate": 6.056198164775786e-07, + "loss": 1.5594, + "step": 3631 + }, + { + "epoch": 0.25307459150611433, + "grad_norm": 0.8002734672962935, + "learning_rate": 6.055705917780987e-07, + "loss": 1.5608, + "step": 3632 + }, + { + "epoch": 0.2531442706337317, + "grad_norm": 0.7754927308854186, + "learning_rate": 6.055213565084229e-07, + "loss": 1.678, + "step": 3633 + }, + { + "epoch": 0.253213949761349, + "grad_norm": 0.832975102500679, + "learning_rate": 6.054721106709111e-07, + "loss": 1.5249, + "step": 3634 + }, + { + "epoch": 0.25328362888896633, + "grad_norm": 0.697784227911405, + "learning_rate": 6.054228542679231e-07, + "loss": 1.5183, + "step": 3635 + }, + { + "epoch": 0.2533533080165836, + "grad_norm": 0.7001080333414058, + "learning_rate": 6.053735873018195e-07, + "loss": 1.5209, + "step": 3636 + }, + { + "epoch": 0.253422987144201, + "grad_norm": 0.6923807006499639, + "learning_rate": 6.053243097749611e-07, + "loss": 1.5297, + "step": 3637 + }, + { + "epoch": 0.25349266627181827, + "grad_norm": 0.8379293298094277, + "learning_rate": 6.052750216897095e-07, + "loss": 1.5739, + "step": 3638 + }, + { + "epoch": 0.2535623453994356, + "grad_norm": 0.6882683163202826, + "learning_rate": 6.052257230484266e-07, + "loss": 1.4844, + "step": 3639 + }, + { + "epoch": 0.2536320245270529, + "grad_norm": 0.7033906157479533, + "learning_rate": 6.05176413853475e-07, + "loss": 1.435, + "step": 3640 + }, + { + "epoch": 0.25370170365467026, + "grad_norm": 0.6733432624515764, + "learning_rate": 6.051270941072176e-07, + "loss": 1.3151, + "step": 3641 + }, + { + "epoch": 0.25377138278228756, + "grad_norm": 0.6875760437295031, + "learning_rate": 6.050777638120179e-07, + "loss": 1.5586, + "step": 3642 + }, + { + "epoch": 0.2538410619099049, + "grad_norm": 0.7383449477336569, + "learning_rate": 6.050284229702399e-07, + "loss": 1.468, + "step": 3643 + }, + { + "epoch": 0.2539107410375222, + "grad_norm": 0.7189865672271544, + "learning_rate": 6.049790715842483e-07, + "loss": 1.5838, + "step": 3644 + }, + { + "epoch": 0.25398042016513955, + "grad_norm": 0.6813274745603186, + "learning_rate": 6.04929709656408e-07, + "loss": 1.4486, + "step": 3645 + }, + { + "epoch": 0.25405009929275685, + "grad_norm": 0.7344695622304153, + "learning_rate": 6.048803371890844e-07, + "loss": 1.499, + "step": 3646 + }, + { + "epoch": 0.2541197784203742, + "grad_norm": 0.7489876891373174, + "learning_rate": 6.048309541846436e-07, + "loss": 1.573, + "step": 3647 + }, + { + "epoch": 0.2541894575479915, + "grad_norm": 0.6704671719396704, + "learning_rate": 6.047815606454523e-07, + "loss": 1.4899, + "step": 3648 + }, + { + "epoch": 0.25425913667560884, + "grad_norm": 0.7009858120136003, + "learning_rate": 6.047321565738773e-07, + "loss": 1.5259, + "step": 3649 + }, + { + "epoch": 0.25432881580322614, + "grad_norm": 0.6893590317823148, + "learning_rate": 6.046827419722863e-07, + "loss": 1.5622, + "step": 3650 + }, + { + "epoch": 0.2543984949308435, + "grad_norm": 0.7623085036945566, + "learning_rate": 6.046333168430474e-07, + "loss": 1.7747, + "step": 3651 + }, + { + "epoch": 0.2544681740584608, + "grad_norm": 0.6939366234602085, + "learning_rate": 6.045838811885289e-07, + "loss": 1.6092, + "step": 3652 + }, + { + "epoch": 0.25453785318607813, + "grad_norm": 0.7396889519355708, + "learning_rate": 6.045344350111001e-07, + "loss": 1.5682, + "step": 3653 + }, + { + "epoch": 0.25460753231369543, + "grad_norm": 0.7759652144680148, + "learning_rate": 6.044849783131304e-07, + "loss": 1.6867, + "step": 3654 + }, + { + "epoch": 0.2546772114413128, + "grad_norm": 0.6993831801702053, + "learning_rate": 6.044355110969901e-07, + "loss": 1.5669, + "step": 3655 + }, + { + "epoch": 0.2547468905689301, + "grad_norm": 0.738306542563742, + "learning_rate": 6.043860333650495e-07, + "loss": 1.4476, + "step": 3656 + }, + { + "epoch": 0.2548165696965474, + "grad_norm": 0.6945902157434075, + "learning_rate": 6.043365451196799e-07, + "loss": 1.5581, + "step": 3657 + }, + { + "epoch": 0.2548862488241647, + "grad_norm": 0.7798903180473131, + "learning_rate": 6.042870463632525e-07, + "loss": 1.5781, + "step": 3658 + }, + { + "epoch": 0.25495592795178207, + "grad_norm": 0.6992002259989656, + "learning_rate": 6.042375370981399e-07, + "loss": 1.5645, + "step": 3659 + }, + { + "epoch": 0.25502560707939936, + "grad_norm": 0.6678133223554386, + "learning_rate": 6.041880173267144e-07, + "loss": 1.4591, + "step": 3660 + }, + { + "epoch": 0.2550952862070167, + "grad_norm": 0.6941086204495733, + "learning_rate": 6.041384870513491e-07, + "loss": 1.5182, + "step": 3661 + }, + { + "epoch": 0.255164965334634, + "grad_norm": 0.7269003400483975, + "learning_rate": 6.040889462744175e-07, + "loss": 1.4942, + "step": 3662 + }, + { + "epoch": 0.25523464446225136, + "grad_norm": 0.6982954121554564, + "learning_rate": 6.04039394998294e-07, + "loss": 1.5595, + "step": 3663 + }, + { + "epoch": 0.25530432358986865, + "grad_norm": 0.7186993415671693, + "learning_rate": 6.039898332253529e-07, + "loss": 1.4928, + "step": 3664 + }, + { + "epoch": 0.255374002717486, + "grad_norm": 0.7386453946777918, + "learning_rate": 6.039402609579695e-07, + "loss": 1.514, + "step": 3665 + }, + { + "epoch": 0.2554436818451033, + "grad_norm": 0.7891239561661717, + "learning_rate": 6.038906781985192e-07, + "loss": 1.5981, + "step": 3666 + }, + { + "epoch": 0.25551336097272065, + "grad_norm": 0.7333305917018511, + "learning_rate": 6.038410849493784e-07, + "loss": 1.6627, + "step": 3667 + }, + { + "epoch": 0.25558304010033794, + "grad_norm": 0.7238548352809335, + "learning_rate": 6.037914812129233e-07, + "loss": 1.5441, + "step": 3668 + }, + { + "epoch": 0.25565271922795524, + "grad_norm": 0.7443351369453468, + "learning_rate": 6.037418669915314e-07, + "loss": 1.5688, + "step": 3669 + }, + { + "epoch": 0.2557223983555726, + "grad_norm": 0.766995135505678, + "learning_rate": 6.036922422875802e-07, + "loss": 1.6102, + "step": 3670 + }, + { + "epoch": 0.2557920774831899, + "grad_norm": 0.7051706694725376, + "learning_rate": 6.036426071034477e-07, + "loss": 1.6172, + "step": 3671 + }, + { + "epoch": 0.25586175661080723, + "grad_norm": 0.7193831318040109, + "learning_rate": 6.035929614415127e-07, + "loss": 1.4728, + "step": 3672 + }, + { + "epoch": 0.25593143573842453, + "grad_norm": 0.6910373279699918, + "learning_rate": 6.035433053041542e-07, + "loss": 1.4988, + "step": 3673 + }, + { + "epoch": 0.2560011148660419, + "grad_norm": 0.6740834430678302, + "learning_rate": 6.034936386937517e-07, + "loss": 1.4256, + "step": 3674 + }, + { + "epoch": 0.2560707939936592, + "grad_norm": 0.7120350209786269, + "learning_rate": 6.034439616126855e-07, + "loss": 1.4566, + "step": 3675 + }, + { + "epoch": 0.2561404731212765, + "grad_norm": 0.7113209360044545, + "learning_rate": 6.033942740633364e-07, + "loss": 1.5773, + "step": 3676 + }, + { + "epoch": 0.2562101522488938, + "grad_norm": 0.7035221439888028, + "learning_rate": 6.033445760480852e-07, + "loss": 1.5885, + "step": 3677 + }, + { + "epoch": 0.25627983137651117, + "grad_norm": 0.7274637005286824, + "learning_rate": 6.032948675693137e-07, + "loss": 1.6685, + "step": 3678 + }, + { + "epoch": 0.25634951050412846, + "grad_norm": 0.7882564844836323, + "learning_rate": 6.03245148629404e-07, + "loss": 1.5647, + "step": 3679 + }, + { + "epoch": 0.2564191896317458, + "grad_norm": 0.7194757461164577, + "learning_rate": 6.031954192307387e-07, + "loss": 1.6198, + "step": 3680 + }, + { + "epoch": 0.2564888687593631, + "grad_norm": 0.7119046320308019, + "learning_rate": 6.031456793757009e-07, + "loss": 1.5605, + "step": 3681 + }, + { + "epoch": 0.25655854788698046, + "grad_norm": 0.7037393192757397, + "learning_rate": 6.030959290666744e-07, + "loss": 1.5774, + "step": 3682 + }, + { + "epoch": 0.25662822701459775, + "grad_norm": 0.7657401184144824, + "learning_rate": 6.030461683060431e-07, + "loss": 1.5446, + "step": 3683 + }, + { + "epoch": 0.2566979061422151, + "grad_norm": 0.7131206536769255, + "learning_rate": 6.02996397096192e-07, + "loss": 1.5325, + "step": 3684 + }, + { + "epoch": 0.2567675852698324, + "grad_norm": 0.9433307003881891, + "learning_rate": 6.029466154395059e-07, + "loss": 1.5525, + "step": 3685 + }, + { + "epoch": 0.25683726439744975, + "grad_norm": 0.7386253299710522, + "learning_rate": 6.028968233383705e-07, + "loss": 1.4944, + "step": 3686 + }, + { + "epoch": 0.25690694352506704, + "grad_norm": 0.7490420571721698, + "learning_rate": 6.028470207951719e-07, + "loss": 1.5285, + "step": 3687 + }, + { + "epoch": 0.2569766226526844, + "grad_norm": 0.7557014064188572, + "learning_rate": 6.027972078122972e-07, + "loss": 1.6712, + "step": 3688 + }, + { + "epoch": 0.2570463017803017, + "grad_norm": 0.7693733995532998, + "learning_rate": 6.027473843921329e-07, + "loss": 1.6772, + "step": 3689 + }, + { + "epoch": 0.25711598090791904, + "grad_norm": 0.7186790659646887, + "learning_rate": 6.026975505370669e-07, + "loss": 1.5178, + "step": 3690 + }, + { + "epoch": 0.25718566003553633, + "grad_norm": 0.7160997653437787, + "learning_rate": 6.026477062494874e-07, + "loss": 1.4966, + "step": 3691 + }, + { + "epoch": 0.2572553391631537, + "grad_norm": 0.758441640620066, + "learning_rate": 6.025978515317831e-07, + "loss": 1.6479, + "step": 3692 + }, + { + "epoch": 0.257325018290771, + "grad_norm": 0.7598611067484194, + "learning_rate": 6.025479863863429e-07, + "loss": 1.6224, + "step": 3693 + }, + { + "epoch": 0.25739469741838833, + "grad_norm": 0.7295528775548727, + "learning_rate": 6.024981108155564e-07, + "loss": 1.6253, + "step": 3694 + }, + { + "epoch": 0.2574643765460056, + "grad_norm": 0.8061244876510675, + "learning_rate": 6.024482248218143e-07, + "loss": 1.5635, + "step": 3695 + }, + { + "epoch": 0.257534055673623, + "grad_norm": 0.7410224621117784, + "learning_rate": 6.023983284075067e-07, + "loss": 1.5478, + "step": 3696 + }, + { + "epoch": 0.25760373480124027, + "grad_norm": 0.784543350091465, + "learning_rate": 6.023484215750249e-07, + "loss": 1.498, + "step": 3697 + }, + { + "epoch": 0.2576734139288576, + "grad_norm": 0.7045651266569742, + "learning_rate": 6.022985043267604e-07, + "loss": 1.4791, + "step": 3698 + }, + { + "epoch": 0.2577430930564749, + "grad_norm": 0.6606350336266129, + "learning_rate": 6.022485766651056e-07, + "loss": 1.54, + "step": 3699 + }, + { + "epoch": 0.25781277218409226, + "grad_norm": 0.707716525899006, + "learning_rate": 6.02198638592453e-07, + "loss": 1.4594, + "step": 3700 + }, + { + "epoch": 0.25788245131170956, + "grad_norm": 0.7500573443667033, + "learning_rate": 6.021486901111957e-07, + "loss": 1.6371, + "step": 3701 + }, + { + "epoch": 0.2579521304393269, + "grad_norm": 0.7185889496885267, + "learning_rate": 6.020987312237273e-07, + "loss": 1.5289, + "step": 3702 + }, + { + "epoch": 0.2580218095669442, + "grad_norm": 0.7717476468655174, + "learning_rate": 6.020487619324421e-07, + "loss": 1.5658, + "step": 3703 + }, + { + "epoch": 0.25809148869456155, + "grad_norm": 0.7144107425409536, + "learning_rate": 6.019987822397346e-07, + "loss": 1.4534, + "step": 3704 + }, + { + "epoch": 0.25816116782217885, + "grad_norm": 0.6853109997445458, + "learning_rate": 6.019487921480001e-07, + "loss": 1.5396, + "step": 3705 + }, + { + "epoch": 0.2582308469497962, + "grad_norm": 0.6840861044462495, + "learning_rate": 6.018987916596339e-07, + "loss": 1.5586, + "step": 3706 + }, + { + "epoch": 0.2583005260774135, + "grad_norm": 0.7329357026479503, + "learning_rate": 6.018487807770325e-07, + "loss": 1.5688, + "step": 3707 + }, + { + "epoch": 0.25837020520503085, + "grad_norm": 0.7701244503623086, + "learning_rate": 6.017987595025921e-07, + "loss": 1.6769, + "step": 3708 + }, + { + "epoch": 0.25843988433264814, + "grad_norm": 0.7615865858349096, + "learning_rate": 6.017487278387103e-07, + "loss": 1.6313, + "step": 3709 + }, + { + "epoch": 0.2585095634602655, + "grad_norm": 0.7481709300334196, + "learning_rate": 6.016986857877844e-07, + "loss": 1.6588, + "step": 3710 + }, + { + "epoch": 0.2585792425878828, + "grad_norm": 0.7067990272665308, + "learning_rate": 6.016486333522125e-07, + "loss": 1.6063, + "step": 3711 + }, + { + "epoch": 0.25864892171550014, + "grad_norm": 0.7049759736878695, + "learning_rate": 6.015985705343935e-07, + "loss": 1.6585, + "step": 3712 + }, + { + "epoch": 0.25871860084311743, + "grad_norm": 0.6755596694999774, + "learning_rate": 6.015484973367262e-07, + "loss": 1.4303, + "step": 3713 + }, + { + "epoch": 0.2587882799707348, + "grad_norm": 0.7357990096575269, + "learning_rate": 6.014984137616104e-07, + "loss": 1.5035, + "step": 3714 + }, + { + "epoch": 0.2588579590983521, + "grad_norm": 0.7120439456701165, + "learning_rate": 6.014483198114461e-07, + "loss": 1.5557, + "step": 3715 + }, + { + "epoch": 0.2589276382259694, + "grad_norm": 0.7012771032934041, + "learning_rate": 6.01398215488634e-07, + "loss": 1.5811, + "step": 3716 + }, + { + "epoch": 0.2589973173535867, + "grad_norm": 0.7305842165014204, + "learning_rate": 6.013481007955752e-07, + "loss": 1.5853, + "step": 3717 + }, + { + "epoch": 0.25906699648120407, + "grad_norm": 0.7919793887127279, + "learning_rate": 6.012979757346712e-07, + "loss": 1.5073, + "step": 3718 + }, + { + "epoch": 0.25913667560882137, + "grad_norm": 0.7068150998297997, + "learning_rate": 6.012478403083242e-07, + "loss": 1.5391, + "step": 3719 + }, + { + "epoch": 0.2592063547364387, + "grad_norm": 0.6993516094342813, + "learning_rate": 6.011976945189366e-07, + "loss": 1.6703, + "step": 3720 + }, + { + "epoch": 0.259276033864056, + "grad_norm": 0.7608863004195396, + "learning_rate": 6.011475383689118e-07, + "loss": 1.5703, + "step": 3721 + }, + { + "epoch": 0.25934571299167336, + "grad_norm": 0.6914592273320355, + "learning_rate": 6.010973718606531e-07, + "loss": 1.4446, + "step": 3722 + }, + { + "epoch": 0.25941539211929066, + "grad_norm": 0.6983881594270946, + "learning_rate": 6.010471949965648e-07, + "loss": 1.4893, + "step": 3723 + }, + { + "epoch": 0.259485071246908, + "grad_norm": 0.7046492246974446, + "learning_rate": 6.009970077790513e-07, + "loss": 1.5758, + "step": 3724 + }, + { + "epoch": 0.2595547503745253, + "grad_norm": 0.7441364234564669, + "learning_rate": 6.009468102105178e-07, + "loss": 1.4596, + "step": 3725 + }, + { + "epoch": 0.25962442950214265, + "grad_norm": 0.7656422173680353, + "learning_rate": 6.008966022933698e-07, + "loss": 1.6614, + "step": 3726 + }, + { + "epoch": 0.25969410862975995, + "grad_norm": 0.7559299822870974, + "learning_rate": 6.008463840300134e-07, + "loss": 1.5062, + "step": 3727 + }, + { + "epoch": 0.2597637877573773, + "grad_norm": 0.7669212531621459, + "learning_rate": 6.007961554228552e-07, + "loss": 1.7258, + "step": 3728 + }, + { + "epoch": 0.2598334668849946, + "grad_norm": 0.6772176348456944, + "learning_rate": 6.007459164743022e-07, + "loss": 1.4701, + "step": 3729 + }, + { + "epoch": 0.25990314601261194, + "grad_norm": 0.7499056635774635, + "learning_rate": 6.006956671867618e-07, + "loss": 1.6034, + "step": 3730 + }, + { + "epoch": 0.25997282514022924, + "grad_norm": 0.7225704922652993, + "learning_rate": 6.006454075626425e-07, + "loss": 1.5875, + "step": 3731 + }, + { + "epoch": 0.2600425042678466, + "grad_norm": 0.6980878984816783, + "learning_rate": 6.005951376043523e-07, + "loss": 1.5183, + "step": 3732 + }, + { + "epoch": 0.2601121833954639, + "grad_norm": 0.7491400607437037, + "learning_rate": 6.005448573143007e-07, + "loss": 1.5805, + "step": 3733 + }, + { + "epoch": 0.26018186252308123, + "grad_norm": 0.8043793521506405, + "learning_rate": 6.004945666948968e-07, + "loss": 1.5908, + "step": 3734 + }, + { + "epoch": 0.2602515416506985, + "grad_norm": 0.691452771123545, + "learning_rate": 6.004442657485511e-07, + "loss": 1.4322, + "step": 3735 + }, + { + "epoch": 0.2603212207783159, + "grad_norm": 0.7392515048446673, + "learning_rate": 6.003939544776738e-07, + "loss": 1.4845, + "step": 3736 + }, + { + "epoch": 0.26039089990593317, + "grad_norm": 0.7362991468882549, + "learning_rate": 6.003436328846759e-07, + "loss": 1.6551, + "step": 3737 + }, + { + "epoch": 0.2604605790335505, + "grad_norm": 0.7294508844401375, + "learning_rate": 6.002933009719691e-07, + "loss": 1.5713, + "step": 3738 + }, + { + "epoch": 0.2605302581611678, + "grad_norm": 0.7223413768091034, + "learning_rate": 6.002429587419654e-07, + "loss": 1.6509, + "step": 3739 + }, + { + "epoch": 0.26059993728878517, + "grad_norm": 0.7098332952131358, + "learning_rate": 6.001926061970771e-07, + "loss": 1.5559, + "step": 3740 + }, + { + "epoch": 0.26066961641640246, + "grad_norm": 0.683662666270985, + "learning_rate": 6.001422433397174e-07, + "loss": 1.5261, + "step": 3741 + }, + { + "epoch": 0.2607392955440198, + "grad_norm": 0.7534065865215693, + "learning_rate": 6.000918701722998e-07, + "loss": 1.498, + "step": 3742 + }, + { + "epoch": 0.2608089746716371, + "grad_norm": 0.6924815806601603, + "learning_rate": 6.00041486697238e-07, + "loss": 1.5978, + "step": 3743 + }, + { + "epoch": 0.26087865379925446, + "grad_norm": 0.739100535066475, + "learning_rate": 5.99991092916947e-07, + "loss": 1.5646, + "step": 3744 + }, + { + "epoch": 0.26094833292687175, + "grad_norm": 0.7083409366002702, + "learning_rate": 5.999406888338412e-07, + "loss": 1.5164, + "step": 3745 + }, + { + "epoch": 0.2610180120544891, + "grad_norm": 0.6906008402718623, + "learning_rate": 5.998902744503363e-07, + "loss": 1.5732, + "step": 3746 + }, + { + "epoch": 0.2610876911821064, + "grad_norm": 0.8148269080140913, + "learning_rate": 5.998398497688484e-07, + "loss": 1.5443, + "step": 3747 + }, + { + "epoch": 0.26115737030972375, + "grad_norm": 0.7406162957615721, + "learning_rate": 5.99789414791794e-07, + "loss": 1.5242, + "step": 3748 + }, + { + "epoch": 0.26122704943734104, + "grad_norm": 0.7937614876731822, + "learning_rate": 5.997389695215896e-07, + "loss": 1.5805, + "step": 3749 + }, + { + "epoch": 0.2612967285649584, + "grad_norm": 0.7573554802703404, + "learning_rate": 5.99688513960653e-07, + "loss": 1.5713, + "step": 3750 + }, + { + "epoch": 0.2613664076925757, + "grad_norm": 0.7646561069072213, + "learning_rate": 5.996380481114021e-07, + "loss": 1.5567, + "step": 3751 + }, + { + "epoch": 0.26143608682019304, + "grad_norm": 0.6612253926929647, + "learning_rate": 5.995875719762554e-07, + "loss": 1.5712, + "step": 3752 + }, + { + "epoch": 0.26150576594781033, + "grad_norm": 0.692424855543187, + "learning_rate": 5.995370855576315e-07, + "loss": 1.579, + "step": 3753 + }, + { + "epoch": 0.2615754450754277, + "grad_norm": 0.7442513843985062, + "learning_rate": 5.994865888579501e-07, + "loss": 1.4847, + "step": 3754 + }, + { + "epoch": 0.261645124203045, + "grad_norm": 0.7325976152475112, + "learning_rate": 5.994360818796312e-07, + "loss": 1.2739, + "step": 3755 + }, + { + "epoch": 0.2617148033306623, + "grad_norm": 0.6912059227547824, + "learning_rate": 5.993855646250948e-07, + "loss": 1.4752, + "step": 3756 + }, + { + "epoch": 0.2617844824582796, + "grad_norm": 0.7352347757664525, + "learning_rate": 5.993350370967621e-07, + "loss": 1.5596, + "step": 3757 + }, + { + "epoch": 0.26185416158589697, + "grad_norm": 0.6756060946917455, + "learning_rate": 5.992844992970544e-07, + "loss": 1.6212, + "step": 3758 + }, + { + "epoch": 0.26192384071351427, + "grad_norm": 0.6798226503928068, + "learning_rate": 5.992339512283936e-07, + "loss": 1.4912, + "step": 3759 + }, + { + "epoch": 0.26199351984113156, + "grad_norm": 0.7513251410438295, + "learning_rate": 5.991833928932022e-07, + "loss": 1.6829, + "step": 3760 + }, + { + "epoch": 0.2620631989687489, + "grad_norm": 0.6865988084084071, + "learning_rate": 5.991328242939027e-07, + "loss": 1.4194, + "step": 3761 + }, + { + "epoch": 0.2621328780963662, + "grad_norm": 0.6832901524123712, + "learning_rate": 5.990822454329185e-07, + "loss": 1.4876, + "step": 3762 + }, + { + "epoch": 0.26220255722398356, + "grad_norm": 0.7563513546614019, + "learning_rate": 5.990316563126739e-07, + "loss": 1.5627, + "step": 3763 + }, + { + "epoch": 0.26227223635160085, + "grad_norm": 0.7891804562004142, + "learning_rate": 5.989810569355928e-07, + "loss": 1.5799, + "step": 3764 + }, + { + "epoch": 0.2623419154792182, + "grad_norm": 0.7740143811084153, + "learning_rate": 5.989304473041002e-07, + "loss": 1.5068, + "step": 3765 + }, + { + "epoch": 0.2624115946068355, + "grad_norm": 0.6755385725152862, + "learning_rate": 5.988798274206213e-07, + "loss": 1.5628, + "step": 3766 + }, + { + "epoch": 0.26248127373445285, + "grad_norm": 0.67862127255677, + "learning_rate": 5.98829197287582e-07, + "loss": 1.4632, + "step": 3767 + }, + { + "epoch": 0.26255095286207014, + "grad_norm": 0.7277909881984042, + "learning_rate": 5.987785569074086e-07, + "loss": 1.4731, + "step": 3768 + }, + { + "epoch": 0.2626206319896875, + "grad_norm": 0.7746379657241323, + "learning_rate": 5.987279062825278e-07, + "loss": 1.5755, + "step": 3769 + }, + { + "epoch": 0.2626903111173048, + "grad_norm": 0.7188812385609231, + "learning_rate": 5.986772454153671e-07, + "loss": 1.5729, + "step": 3770 + }, + { + "epoch": 0.26275999024492214, + "grad_norm": 0.7012551759451631, + "learning_rate": 5.98626574308354e-07, + "loss": 1.579, + "step": 3771 + }, + { + "epoch": 0.26282966937253943, + "grad_norm": 0.6879359396906378, + "learning_rate": 5.985758929639171e-07, + "loss": 1.5858, + "step": 3772 + }, + { + "epoch": 0.2628993485001568, + "grad_norm": 0.7367205083171565, + "learning_rate": 5.985252013844848e-07, + "loss": 1.6155, + "step": 3773 + }, + { + "epoch": 0.2629690276277741, + "grad_norm": 0.6755010882852025, + "learning_rate": 5.984744995724865e-07, + "loss": 1.544, + "step": 3774 + }, + { + "epoch": 0.2630387067553914, + "grad_norm": 0.7220361856984341, + "learning_rate": 5.984237875303518e-07, + "loss": 1.5533, + "step": 3775 + }, + { + "epoch": 0.2631083858830087, + "grad_norm": 0.6939224696872534, + "learning_rate": 5.983730652605112e-07, + "loss": 1.5252, + "step": 3776 + }, + { + "epoch": 0.26317806501062607, + "grad_norm": 0.7790602795147022, + "learning_rate": 5.983223327653953e-07, + "loss": 1.5188, + "step": 3777 + }, + { + "epoch": 0.26324774413824337, + "grad_norm": 0.6997604006917725, + "learning_rate": 5.982715900474351e-07, + "loss": 1.5381, + "step": 3778 + }, + { + "epoch": 0.2633174232658607, + "grad_norm": 0.7261413109184857, + "learning_rate": 5.982208371090626e-07, + "loss": 1.5845, + "step": 3779 + }, + { + "epoch": 0.263387102393478, + "grad_norm": 0.7014290816886654, + "learning_rate": 5.981700739527099e-07, + "loss": 1.4236, + "step": 3780 + }, + { + "epoch": 0.26345678152109536, + "grad_norm": 0.7262834750193699, + "learning_rate": 5.981193005808095e-07, + "loss": 1.5169, + "step": 3781 + }, + { + "epoch": 0.26352646064871266, + "grad_norm": 0.7254743030652147, + "learning_rate": 5.980685169957948e-07, + "loss": 1.5322, + "step": 3782 + }, + { + "epoch": 0.26359613977633, + "grad_norm": 0.7169497660477526, + "learning_rate": 5.980177232000992e-07, + "loss": 1.6798, + "step": 3783 + }, + { + "epoch": 0.2636658189039473, + "grad_norm": 0.7078883148268486, + "learning_rate": 5.979669191961572e-07, + "loss": 1.5295, + "step": 3784 + }, + { + "epoch": 0.26373549803156465, + "grad_norm": 0.6876144944709754, + "learning_rate": 5.979161049864031e-07, + "loss": 1.4183, + "step": 3785 + }, + { + "epoch": 0.26380517715918195, + "grad_norm": 0.7007535327284083, + "learning_rate": 5.978652805732721e-07, + "loss": 1.5592, + "step": 3786 + }, + { + "epoch": 0.2638748562867993, + "grad_norm": 0.7329176497548947, + "learning_rate": 5.978144459591999e-07, + "loss": 1.6272, + "step": 3787 + }, + { + "epoch": 0.2639445354144166, + "grad_norm": 0.7552545836313325, + "learning_rate": 5.977636011466225e-07, + "loss": 1.6171, + "step": 3788 + }, + { + "epoch": 0.26401421454203394, + "grad_norm": 0.7909895832044823, + "learning_rate": 5.977127461379767e-07, + "loss": 1.586, + "step": 3789 + }, + { + "epoch": 0.26408389366965124, + "grad_norm": 0.7172742828183447, + "learning_rate": 5.976618809356991e-07, + "loss": 1.5546, + "step": 3790 + }, + { + "epoch": 0.2641535727972686, + "grad_norm": 0.7377383764434615, + "learning_rate": 5.976110055422278e-07, + "loss": 1.592, + "step": 3791 + }, + { + "epoch": 0.2642232519248859, + "grad_norm": 0.7335537641666481, + "learning_rate": 5.975601199600006e-07, + "loss": 1.7052, + "step": 3792 + }, + { + "epoch": 0.26429293105250323, + "grad_norm": 0.7697742371966286, + "learning_rate": 5.975092241914562e-07, + "loss": 1.5435, + "step": 3793 + }, + { + "epoch": 0.2643626101801205, + "grad_norm": 0.7501305244135283, + "learning_rate": 5.974583182390333e-07, + "loss": 1.4065, + "step": 3794 + }, + { + "epoch": 0.2644322893077379, + "grad_norm": 0.721222982117427, + "learning_rate": 5.974074021051717e-07, + "loss": 1.6465, + "step": 3795 + }, + { + "epoch": 0.2645019684353552, + "grad_norm": 0.7255111363489318, + "learning_rate": 5.973564757923113e-07, + "loss": 1.5097, + "step": 3796 + }, + { + "epoch": 0.2645716475629725, + "grad_norm": 0.7725503160769095, + "learning_rate": 5.973055393028927e-07, + "loss": 1.5885, + "step": 3797 + }, + { + "epoch": 0.2646413266905898, + "grad_norm": 0.722794112667975, + "learning_rate": 5.972545926393567e-07, + "loss": 1.4518, + "step": 3798 + }, + { + "epoch": 0.26471100581820717, + "grad_norm": 0.750610046876159, + "learning_rate": 5.97203635804145e-07, + "loss": 1.4518, + "step": 3799 + }, + { + "epoch": 0.26478068494582446, + "grad_norm": 0.7373683509483345, + "learning_rate": 5.971526687996992e-07, + "loss": 1.6069, + "step": 3800 + }, + { + "epoch": 0.2648503640734418, + "grad_norm": 0.7197788095293496, + "learning_rate": 5.971016916284623e-07, + "loss": 1.4767, + "step": 3801 + }, + { + "epoch": 0.2649200432010591, + "grad_norm": 0.7489669567408815, + "learning_rate": 5.970507042928765e-07, + "loss": 1.5974, + "step": 3802 + }, + { + "epoch": 0.26498972232867646, + "grad_norm": 0.7200758831805548, + "learning_rate": 5.969997067953859e-07, + "loss": 1.4463, + "step": 3803 + }, + { + "epoch": 0.26505940145629375, + "grad_norm": 0.7146080622081296, + "learning_rate": 5.969486991384342e-07, + "loss": 1.4771, + "step": 3804 + }, + { + "epoch": 0.2651290805839111, + "grad_norm": 0.712843526792142, + "learning_rate": 5.968976813244654e-07, + "loss": 1.5082, + "step": 3805 + }, + { + "epoch": 0.2651987597115284, + "grad_norm": 0.7322880790478435, + "learning_rate": 5.968466533559249e-07, + "loss": 1.517, + "step": 3806 + }, + { + "epoch": 0.26526843883914575, + "grad_norm": 0.7157357524960127, + "learning_rate": 5.967956152352578e-07, + "loss": 1.6056, + "step": 3807 + }, + { + "epoch": 0.26533811796676304, + "grad_norm": 0.7047036158964357, + "learning_rate": 5.967445669649101e-07, + "loss": 1.4682, + "step": 3808 + }, + { + "epoch": 0.2654077970943804, + "grad_norm": 0.7234814253316819, + "learning_rate": 5.96693508547328e-07, + "loss": 1.4715, + "step": 3809 + }, + { + "epoch": 0.2654774762219977, + "grad_norm": 0.7030356586208922, + "learning_rate": 5.966424399849583e-07, + "loss": 1.5992, + "step": 3810 + }, + { + "epoch": 0.26554715534961504, + "grad_norm": 0.7439713742620705, + "learning_rate": 5.965913612802485e-07, + "loss": 1.5361, + "step": 3811 + }, + { + "epoch": 0.26561683447723233, + "grad_norm": 0.739432309777115, + "learning_rate": 5.965402724356462e-07, + "loss": 1.6648, + "step": 3812 + }, + { + "epoch": 0.2656865136048497, + "grad_norm": 0.7282252553032253, + "learning_rate": 5.964891734535997e-07, + "loss": 1.5186, + "step": 3813 + }, + { + "epoch": 0.265756192732467, + "grad_norm": 0.7404702107542114, + "learning_rate": 5.964380643365579e-07, + "loss": 1.6763, + "step": 3814 + }, + { + "epoch": 0.26582587186008433, + "grad_norm": 0.7245406478505013, + "learning_rate": 5.9638694508697e-07, + "loss": 1.5752, + "step": 3815 + }, + { + "epoch": 0.2658955509877016, + "grad_norm": 0.7015576237292017, + "learning_rate": 5.963358157072858e-07, + "loss": 1.5034, + "step": 3816 + }, + { + "epoch": 0.265965230115319, + "grad_norm": 0.7258510311401303, + "learning_rate": 5.962846761999553e-07, + "loss": 1.499, + "step": 3817 + }, + { + "epoch": 0.26603490924293627, + "grad_norm": 0.7092976051769906, + "learning_rate": 5.962335265674295e-07, + "loss": 1.5071, + "step": 3818 + }, + { + "epoch": 0.2661045883705536, + "grad_norm": 0.7026549998783386, + "learning_rate": 5.961823668121593e-07, + "loss": 1.444, + "step": 3819 + }, + { + "epoch": 0.2661742674981709, + "grad_norm": 0.7836415542286314, + "learning_rate": 5.961311969365966e-07, + "loss": 1.6607, + "step": 3820 + }, + { + "epoch": 0.26624394662578826, + "grad_norm": 0.7517676503653571, + "learning_rate": 5.960800169431935e-07, + "loss": 1.6165, + "step": 3821 + }, + { + "epoch": 0.26631362575340556, + "grad_norm": 0.7071949248379115, + "learning_rate": 5.960288268344027e-07, + "loss": 1.6095, + "step": 3822 + }, + { + "epoch": 0.2663833048810229, + "grad_norm": 0.7524462940338275, + "learning_rate": 5.959776266126772e-07, + "loss": 1.5655, + "step": 3823 + }, + { + "epoch": 0.2664529840086402, + "grad_norm": 0.7905471950668109, + "learning_rate": 5.959264162804707e-07, + "loss": 1.5676, + "step": 3824 + }, + { + "epoch": 0.26652266313625755, + "grad_norm": 0.7576424220733925, + "learning_rate": 5.958751958402374e-07, + "loss": 1.5781, + "step": 3825 + }, + { + "epoch": 0.26659234226387485, + "grad_norm": 0.7559598623005291, + "learning_rate": 5.958239652944317e-07, + "loss": 1.3891, + "step": 3826 + }, + { + "epoch": 0.2666620213914922, + "grad_norm": 0.7388353146062195, + "learning_rate": 5.957727246455088e-07, + "loss": 1.399, + "step": 3827 + }, + { + "epoch": 0.2667317005191095, + "grad_norm": 0.7097430840949527, + "learning_rate": 5.957214738959243e-07, + "loss": 1.4865, + "step": 3828 + }, + { + "epoch": 0.26680137964672684, + "grad_norm": 0.7271421482613403, + "learning_rate": 5.95670213048134e-07, + "loss": 1.4648, + "step": 3829 + }, + { + "epoch": 0.26687105877434414, + "grad_norm": 0.7505684797204005, + "learning_rate": 5.956189421045947e-07, + "loss": 1.5283, + "step": 3830 + }, + { + "epoch": 0.2669407379019615, + "grad_norm": 0.7260488787468974, + "learning_rate": 5.955676610677633e-07, + "loss": 1.4497, + "step": 3831 + }, + { + "epoch": 0.2670104170295788, + "grad_norm": 0.7848335820145739, + "learning_rate": 5.955163699400973e-07, + "loss": 1.8227, + "step": 3832 + }, + { + "epoch": 0.26708009615719613, + "grad_norm": 0.6623961844248426, + "learning_rate": 5.954650687240547e-07, + "loss": 1.5249, + "step": 3833 + }, + { + "epoch": 0.26714977528481343, + "grad_norm": 0.6783606484283297, + "learning_rate": 5.954137574220939e-07, + "loss": 1.5641, + "step": 3834 + }, + { + "epoch": 0.2672194544124308, + "grad_norm": 0.6901063180908451, + "learning_rate": 5.953624360366739e-07, + "loss": 1.4457, + "step": 3835 + }, + { + "epoch": 0.2672891335400481, + "grad_norm": 0.6583457935445118, + "learning_rate": 5.95311104570254e-07, + "loss": 1.481, + "step": 3836 + }, + { + "epoch": 0.2673588126676654, + "grad_norm": 0.7459974223450113, + "learning_rate": 5.952597630252943e-07, + "loss": 1.6124, + "step": 3837 + }, + { + "epoch": 0.2674284917952827, + "grad_norm": 0.7200644745261434, + "learning_rate": 5.952084114042551e-07, + "loss": 1.5149, + "step": 3838 + }, + { + "epoch": 0.26749817092290007, + "grad_norm": 0.8150337193713256, + "learning_rate": 5.951570497095973e-07, + "loss": 1.5111, + "step": 3839 + }, + { + "epoch": 0.26756785005051736, + "grad_norm": 0.6647113024228307, + "learning_rate": 5.951056779437821e-07, + "loss": 1.5736, + "step": 3840 + }, + { + "epoch": 0.2676375291781347, + "grad_norm": 0.6977037781613992, + "learning_rate": 5.950542961092715e-07, + "loss": 1.4829, + "step": 3841 + }, + { + "epoch": 0.267707208305752, + "grad_norm": 0.7263040973123264, + "learning_rate": 5.950029042085279e-07, + "loss": 1.4914, + "step": 3842 + }, + { + "epoch": 0.26777688743336936, + "grad_norm": 0.7059006018144998, + "learning_rate": 5.949515022440141e-07, + "loss": 1.4649, + "step": 3843 + }, + { + "epoch": 0.26784656656098665, + "grad_norm": 0.7146294619304026, + "learning_rate": 5.949000902181932e-07, + "loss": 1.5275, + "step": 3844 + }, + { + "epoch": 0.267916245688604, + "grad_norm": 0.7528834166368202, + "learning_rate": 5.948486681335289e-07, + "loss": 1.4976, + "step": 3845 + }, + { + "epoch": 0.2679859248162213, + "grad_norm": 0.6506852625680217, + "learning_rate": 5.947972359924857e-07, + "loss": 1.4088, + "step": 3846 + }, + { + "epoch": 0.26805560394383865, + "grad_norm": 0.7112999509070153, + "learning_rate": 5.947457937975282e-07, + "loss": 1.5822, + "step": 3847 + }, + { + "epoch": 0.26812528307145594, + "grad_norm": 0.6937904601326073, + "learning_rate": 5.946943415511218e-07, + "loss": 1.5623, + "step": 3848 + }, + { + "epoch": 0.2681949621990733, + "grad_norm": 0.7137558963987013, + "learning_rate": 5.946428792557321e-07, + "loss": 1.535, + "step": 3849 + }, + { + "epoch": 0.2682646413266906, + "grad_norm": 0.6923843161735229, + "learning_rate": 5.94591406913825e-07, + "loss": 1.4935, + "step": 3850 + }, + { + "epoch": 0.26833432045430794, + "grad_norm": 0.6818469415482451, + "learning_rate": 5.945399245278675e-07, + "loss": 1.5496, + "step": 3851 + }, + { + "epoch": 0.26840399958192523, + "grad_norm": 0.6965188523204926, + "learning_rate": 5.944884321003267e-07, + "loss": 1.4331, + "step": 3852 + }, + { + "epoch": 0.26847367870954253, + "grad_norm": 0.7738072123350715, + "learning_rate": 5.944369296336701e-07, + "loss": 1.782, + "step": 3853 + }, + { + "epoch": 0.2685433578371599, + "grad_norm": 0.7172882849130879, + "learning_rate": 5.943854171303659e-07, + "loss": 1.5333, + "step": 3854 + }, + { + "epoch": 0.2686130369647772, + "grad_norm": 0.7399299275792292, + "learning_rate": 5.943338945928827e-07, + "loss": 1.4818, + "step": 3855 + }, + { + "epoch": 0.2686827160923945, + "grad_norm": 0.7494982746550088, + "learning_rate": 5.942823620236894e-07, + "loss": 1.5849, + "step": 3856 + }, + { + "epoch": 0.2687523952200118, + "grad_norm": 1.3592300358320584, + "learning_rate": 5.942308194252557e-07, + "loss": 1.6021, + "step": 3857 + }, + { + "epoch": 0.26882207434762917, + "grad_norm": 0.6870898200709619, + "learning_rate": 5.941792668000517e-07, + "loss": 1.5074, + "step": 3858 + }, + { + "epoch": 0.26889175347524646, + "grad_norm": 0.7256808114459625, + "learning_rate": 5.941277041505477e-07, + "loss": 1.5594, + "step": 3859 + }, + { + "epoch": 0.2689614326028638, + "grad_norm": 0.6883037203550751, + "learning_rate": 5.940761314792148e-07, + "loss": 1.5388, + "step": 3860 + }, + { + "epoch": 0.2690311117304811, + "grad_norm": 0.7187263352455807, + "learning_rate": 5.940245487885244e-07, + "loss": 1.4854, + "step": 3861 + }, + { + "epoch": 0.26910079085809846, + "grad_norm": 0.7166388609160896, + "learning_rate": 5.939729560809486e-07, + "loss": 1.4961, + "step": 3862 + }, + { + "epoch": 0.26917046998571575, + "grad_norm": 0.7070290799262712, + "learning_rate": 5.939213533589596e-07, + "loss": 1.5669, + "step": 3863 + }, + { + "epoch": 0.2692401491133331, + "grad_norm": 0.6967168983725861, + "learning_rate": 5.938697406250307e-07, + "loss": 1.3717, + "step": 3864 + }, + { + "epoch": 0.2693098282409504, + "grad_norm": 0.681060189861049, + "learning_rate": 5.938181178816349e-07, + "loss": 1.5998, + "step": 3865 + }, + { + "epoch": 0.26937950736856775, + "grad_norm": 0.8336457379589953, + "learning_rate": 5.93766485131246e-07, + "loss": 1.5647, + "step": 3866 + }, + { + "epoch": 0.26944918649618504, + "grad_norm": 0.6879300359719382, + "learning_rate": 5.937148423763387e-07, + "loss": 1.4075, + "step": 3867 + }, + { + "epoch": 0.2695188656238024, + "grad_norm": 0.7410474852761608, + "learning_rate": 5.936631896193877e-07, + "loss": 1.5457, + "step": 3868 + }, + { + "epoch": 0.2695885447514197, + "grad_norm": 0.7548080374598897, + "learning_rate": 5.936115268628682e-07, + "loss": 1.5415, + "step": 3869 + }, + { + "epoch": 0.26965822387903704, + "grad_norm": 0.7264228100243976, + "learning_rate": 5.935598541092561e-07, + "loss": 1.5861, + "step": 3870 + }, + { + "epoch": 0.26972790300665433, + "grad_norm": 0.7722431128355013, + "learning_rate": 5.935081713610277e-07, + "loss": 1.6332, + "step": 3871 + }, + { + "epoch": 0.2697975821342717, + "grad_norm": 0.754648281167634, + "learning_rate": 5.934564786206595e-07, + "loss": 1.5744, + "step": 3872 + }, + { + "epoch": 0.269867261261889, + "grad_norm": 0.6926936779814764, + "learning_rate": 5.934047758906291e-07, + "loss": 1.4116, + "step": 3873 + }, + { + "epoch": 0.26993694038950633, + "grad_norm": 0.7420917039064506, + "learning_rate": 5.933530631734138e-07, + "loss": 1.5542, + "step": 3874 + }, + { + "epoch": 0.2700066195171236, + "grad_norm": 0.7718671864174498, + "learning_rate": 5.93301340471492e-07, + "loss": 1.5018, + "step": 3875 + }, + { + "epoch": 0.270076298644741, + "grad_norm": 0.7436769872040874, + "learning_rate": 5.932496077873425e-07, + "loss": 1.5629, + "step": 3876 + }, + { + "epoch": 0.27014597777235827, + "grad_norm": 0.7024002295532155, + "learning_rate": 5.93197865123444e-07, + "loss": 1.5561, + "step": 3877 + }, + { + "epoch": 0.2702156568999756, + "grad_norm": 0.7036180595380055, + "learning_rate": 5.931461124822766e-07, + "loss": 1.4487, + "step": 3878 + }, + { + "epoch": 0.2702853360275929, + "grad_norm": 0.7291033209991897, + "learning_rate": 5.9309434986632e-07, + "loss": 1.6397, + "step": 3879 + }, + { + "epoch": 0.27035501515521027, + "grad_norm": 0.7270278064368998, + "learning_rate": 5.930425772780551e-07, + "loss": 1.6212, + "step": 3880 + }, + { + "epoch": 0.27042469428282756, + "grad_norm": 0.7196922674297855, + "learning_rate": 5.929907947199628e-07, + "loss": 1.5628, + "step": 3881 + }, + { + "epoch": 0.2704943734104449, + "grad_norm": 0.7405096746089026, + "learning_rate": 5.929390021945246e-07, + "loss": 1.3544, + "step": 3882 + }, + { + "epoch": 0.2705640525380622, + "grad_norm": 0.7450568923816304, + "learning_rate": 5.928871997042224e-07, + "loss": 1.4874, + "step": 3883 + }, + { + "epoch": 0.27063373166567956, + "grad_norm": 0.7056369007831684, + "learning_rate": 5.928353872515389e-07, + "loss": 1.4927, + "step": 3884 + }, + { + "epoch": 0.27070341079329685, + "grad_norm": 0.761581874927409, + "learning_rate": 5.92783564838957e-07, + "loss": 1.5912, + "step": 3885 + }, + { + "epoch": 0.2707730899209142, + "grad_norm": 0.6921846983017318, + "learning_rate": 5.9273173246896e-07, + "loss": 1.5517, + "step": 3886 + }, + { + "epoch": 0.2708427690485315, + "grad_norm": 0.7002922072429691, + "learning_rate": 5.926798901440321e-07, + "loss": 1.5203, + "step": 3887 + }, + { + "epoch": 0.27091244817614885, + "grad_norm": 0.683729525033674, + "learning_rate": 5.926280378666573e-07, + "loss": 1.4792, + "step": 3888 + }, + { + "epoch": 0.27098212730376614, + "grad_norm": 0.6414599752660434, + "learning_rate": 5.925761756393207e-07, + "loss": 1.5579, + "step": 3889 + }, + { + "epoch": 0.2710518064313835, + "grad_norm": 0.7010029702153505, + "learning_rate": 5.925243034645077e-07, + "loss": 1.3604, + "step": 3890 + }, + { + "epoch": 0.2711214855590008, + "grad_norm": 0.6772803213294848, + "learning_rate": 5.92472421344704e-07, + "loss": 1.4497, + "step": 3891 + }, + { + "epoch": 0.27119116468661814, + "grad_norm": 0.7423859825107686, + "learning_rate": 5.92420529282396e-07, + "loss": 1.6493, + "step": 3892 + }, + { + "epoch": 0.27126084381423543, + "grad_norm": 0.7069407783189319, + "learning_rate": 5.923686272800703e-07, + "loss": 1.4724, + "step": 3893 + }, + { + "epoch": 0.2713305229418528, + "grad_norm": 0.7288513991064071, + "learning_rate": 5.923167153402144e-07, + "loss": 1.6363, + "step": 3894 + }, + { + "epoch": 0.2714002020694701, + "grad_norm": 0.6983283286669905, + "learning_rate": 5.922647934653158e-07, + "loss": 1.5045, + "step": 3895 + }, + { + "epoch": 0.2714698811970874, + "grad_norm": 0.729073744684144, + "learning_rate": 5.922128616578627e-07, + "loss": 1.497, + "step": 3896 + }, + { + "epoch": 0.2715395603247047, + "grad_norm": 0.6998014105535623, + "learning_rate": 5.92160919920344e-07, + "loss": 1.6703, + "step": 3897 + }, + { + "epoch": 0.27160923945232207, + "grad_norm": 0.7285550394088254, + "learning_rate": 5.921089682552487e-07, + "loss": 1.5291, + "step": 3898 + }, + { + "epoch": 0.27167891857993937, + "grad_norm": 0.7584854068310711, + "learning_rate": 5.920570066650665e-07, + "loss": 1.4431, + "step": 3899 + }, + { + "epoch": 0.2717485977075567, + "grad_norm": 0.7782366181505483, + "learning_rate": 5.920050351522874e-07, + "loss": 1.5376, + "step": 3900 + }, + { + "epoch": 0.271818276835174, + "grad_norm": 0.7645158556386534, + "learning_rate": 5.91953053719402e-07, + "loss": 1.6062, + "step": 3901 + }, + { + "epoch": 0.27188795596279136, + "grad_norm": 0.6900052460278817, + "learning_rate": 5.919010623689015e-07, + "loss": 1.5655, + "step": 3902 + }, + { + "epoch": 0.27195763509040866, + "grad_norm": 0.7540368159165663, + "learning_rate": 5.918490611032772e-07, + "loss": 1.6374, + "step": 3903 + }, + { + "epoch": 0.272027314218026, + "grad_norm": 0.7339110598734926, + "learning_rate": 5.917970499250214e-07, + "loss": 1.6146, + "step": 3904 + }, + { + "epoch": 0.2720969933456433, + "grad_norm": 0.7138874824396794, + "learning_rate": 5.917450288366263e-07, + "loss": 1.5524, + "step": 3905 + }, + { + "epoch": 0.27216667247326065, + "grad_norm": 0.6772495253223015, + "learning_rate": 5.916929978405849e-07, + "loss": 1.6166, + "step": 3906 + }, + { + "epoch": 0.27223635160087795, + "grad_norm": 0.7887647513498364, + "learning_rate": 5.916409569393909e-07, + "loss": 1.5605, + "step": 3907 + }, + { + "epoch": 0.2723060307284953, + "grad_norm": 0.6988054387986751, + "learning_rate": 5.915889061355379e-07, + "loss": 1.5141, + "step": 3908 + }, + { + "epoch": 0.2723757098561126, + "grad_norm": 0.7296767353519555, + "learning_rate": 5.915368454315205e-07, + "loss": 1.5563, + "step": 3909 + }, + { + "epoch": 0.27244538898372994, + "grad_norm": 0.7497446074632108, + "learning_rate": 5.914847748298333e-07, + "loss": 1.6255, + "step": 3910 + }, + { + "epoch": 0.27251506811134724, + "grad_norm": 0.6639192068050025, + "learning_rate": 5.914326943329719e-07, + "loss": 1.5285, + "step": 3911 + }, + { + "epoch": 0.2725847472389646, + "grad_norm": 0.7428035774613251, + "learning_rate": 5.913806039434321e-07, + "loss": 1.4783, + "step": 3912 + }, + { + "epoch": 0.2726544263665819, + "grad_norm": 0.7281922760552719, + "learning_rate": 5.913285036637098e-07, + "loss": 1.5469, + "step": 3913 + }, + { + "epoch": 0.27272410549419923, + "grad_norm": 0.7333014735087869, + "learning_rate": 5.912763934963022e-07, + "loss": 1.592, + "step": 3914 + }, + { + "epoch": 0.2727937846218165, + "grad_norm": 0.7798965806035877, + "learning_rate": 5.912242734437064e-07, + "loss": 1.6835, + "step": 3915 + }, + { + "epoch": 0.2728634637494339, + "grad_norm": 0.7536671752599499, + "learning_rate": 5.911721435084199e-07, + "loss": 1.7395, + "step": 3916 + }, + { + "epoch": 0.27293314287705117, + "grad_norm": 0.6902780463980486, + "learning_rate": 5.911200036929411e-07, + "loss": 1.6335, + "step": 3917 + }, + { + "epoch": 0.2730028220046685, + "grad_norm": 0.6967623275559346, + "learning_rate": 5.910678539997686e-07, + "loss": 1.5714, + "step": 3918 + }, + { + "epoch": 0.2730725011322858, + "grad_norm": 0.7397568053887077, + "learning_rate": 5.910156944314013e-07, + "loss": 1.6657, + "step": 3919 + }, + { + "epoch": 0.27314218025990317, + "grad_norm": 0.701659478786721, + "learning_rate": 5.909635249903391e-07, + "loss": 1.5462, + "step": 3920 + }, + { + "epoch": 0.27321185938752046, + "grad_norm": 0.7725419571633488, + "learning_rate": 5.90911345679082e-07, + "loss": 1.6394, + "step": 3921 + }, + { + "epoch": 0.2732815385151378, + "grad_norm": 0.6980051136129517, + "learning_rate": 5.908591565001304e-07, + "loss": 1.4364, + "step": 3922 + }, + { + "epoch": 0.2733512176427551, + "grad_norm": 0.8135918312405112, + "learning_rate": 5.908069574559854e-07, + "loss": 1.5069, + "step": 3923 + }, + { + "epoch": 0.27342089677037246, + "grad_norm": 0.7053317409211138, + "learning_rate": 5.907547485491487e-07, + "loss": 1.5591, + "step": 3924 + }, + { + "epoch": 0.27349057589798975, + "grad_norm": 0.7018480395090504, + "learning_rate": 5.907025297821218e-07, + "loss": 1.5574, + "step": 3925 + }, + { + "epoch": 0.2735602550256071, + "grad_norm": 0.7720773415373251, + "learning_rate": 5.906503011574075e-07, + "loss": 1.4853, + "step": 3926 + }, + { + "epoch": 0.2736299341532244, + "grad_norm": 0.6976717600818462, + "learning_rate": 5.905980626775085e-07, + "loss": 1.542, + "step": 3927 + }, + { + "epoch": 0.27369961328084175, + "grad_norm": 0.7211622466811443, + "learning_rate": 5.905458143449282e-07, + "loss": 1.6, + "step": 3928 + }, + { + "epoch": 0.27376929240845904, + "grad_norm": 0.695676948180807, + "learning_rate": 5.904935561621705e-07, + "loss": 1.61, + "step": 3929 + }, + { + "epoch": 0.2738389715360764, + "grad_norm": 0.7392444394382608, + "learning_rate": 5.904412881317398e-07, + "loss": 1.4697, + "step": 3930 + }, + { + "epoch": 0.2739086506636937, + "grad_norm": 0.7379040097850592, + "learning_rate": 5.903890102561409e-07, + "loss": 1.6316, + "step": 3931 + }, + { + "epoch": 0.27397832979131104, + "grad_norm": 0.6838537018000045, + "learning_rate": 5.903367225378788e-07, + "loss": 1.508, + "step": 3932 + }, + { + "epoch": 0.27404800891892833, + "grad_norm": 0.723306590283538, + "learning_rate": 5.902844249794595e-07, + "loss": 1.5367, + "step": 3933 + }, + { + "epoch": 0.2741176880465457, + "grad_norm": 0.7380752803903438, + "learning_rate": 5.902321175833891e-07, + "loss": 1.5289, + "step": 3934 + }, + { + "epoch": 0.274187367174163, + "grad_norm": 0.7063438845989196, + "learning_rate": 5.901798003521742e-07, + "loss": 1.4684, + "step": 3935 + }, + { + "epoch": 0.2742570463017803, + "grad_norm": 0.7569904066641995, + "learning_rate": 5.901274732883223e-07, + "loss": 1.5033, + "step": 3936 + }, + { + "epoch": 0.2743267254293976, + "grad_norm": 0.7296337737834608, + "learning_rate": 5.900751363943405e-07, + "loss": 1.5454, + "step": 3937 + }, + { + "epoch": 0.274396404557015, + "grad_norm": 0.7029267716615997, + "learning_rate": 5.900227896727372e-07, + "loss": 1.5316, + "step": 3938 + }, + { + "epoch": 0.27446608368463227, + "grad_norm": 0.7178752281258443, + "learning_rate": 5.89970433126021e-07, + "loss": 1.5698, + "step": 3939 + }, + { + "epoch": 0.2745357628122496, + "grad_norm": 0.7567086642839586, + "learning_rate": 5.899180667567008e-07, + "loss": 1.516, + "step": 3940 + }, + { + "epoch": 0.2746054419398669, + "grad_norm": 0.6923104599232259, + "learning_rate": 5.898656905672861e-07, + "loss": 1.5541, + "step": 3941 + }, + { + "epoch": 0.27467512106748426, + "grad_norm": 0.710308756305562, + "learning_rate": 5.89813304560287e-07, + "loss": 1.5168, + "step": 3942 + }, + { + "epoch": 0.27474480019510156, + "grad_norm": 0.7618485460806338, + "learning_rate": 5.897609087382139e-07, + "loss": 1.5854, + "step": 3943 + }, + { + "epoch": 0.27481447932271885, + "grad_norm": 0.7897506539988088, + "learning_rate": 5.897085031035776e-07, + "loss": 1.5347, + "step": 3944 + }, + { + "epoch": 0.2748841584503362, + "grad_norm": 0.7576815455048657, + "learning_rate": 5.896560876588897e-07, + "loss": 1.5786, + "step": 3945 + }, + { + "epoch": 0.2749538375779535, + "grad_norm": 0.6614648949215904, + "learning_rate": 5.896036624066618e-07, + "loss": 1.429, + "step": 3946 + }, + { + "epoch": 0.27502351670557085, + "grad_norm": 0.7016203280395538, + "learning_rate": 5.895512273494064e-07, + "loss": 1.5163, + "step": 3947 + }, + { + "epoch": 0.27509319583318814, + "grad_norm": 0.6931782798530423, + "learning_rate": 5.894987824896362e-07, + "loss": 1.5627, + "step": 3948 + }, + { + "epoch": 0.2751628749608055, + "grad_norm": 0.7451596533117754, + "learning_rate": 5.894463278298647e-07, + "loss": 1.559, + "step": 3949 + }, + { + "epoch": 0.2752325540884228, + "grad_norm": 0.8392928248647462, + "learning_rate": 5.893938633726052e-07, + "loss": 1.5077, + "step": 3950 + }, + { + "epoch": 0.27530223321604014, + "grad_norm": 0.7496311483780644, + "learning_rate": 5.893413891203723e-07, + "loss": 1.6871, + "step": 3951 + }, + { + "epoch": 0.27537191234365743, + "grad_norm": 0.6866597786792935, + "learning_rate": 5.892889050756805e-07, + "loss": 1.5015, + "step": 3952 + }, + { + "epoch": 0.2754415914712748, + "grad_norm": 0.7539980443871873, + "learning_rate": 5.89236411241045e-07, + "loss": 1.5783, + "step": 3953 + }, + { + "epoch": 0.2755112705988921, + "grad_norm": 0.7120384308434919, + "learning_rate": 5.891839076189814e-07, + "loss": 1.4073, + "step": 3954 + }, + { + "epoch": 0.2755809497265094, + "grad_norm": 0.7563875392467487, + "learning_rate": 5.891313942120056e-07, + "loss": 1.4677, + "step": 3955 + }, + { + "epoch": 0.2756506288541267, + "grad_norm": 0.6614944369011492, + "learning_rate": 5.890788710226344e-07, + "loss": 1.5537, + "step": 3956 + }, + { + "epoch": 0.2757203079817441, + "grad_norm": 0.6610667999394185, + "learning_rate": 5.890263380533848e-07, + "loss": 1.4159, + "step": 3957 + }, + { + "epoch": 0.27578998710936137, + "grad_norm": 0.7281030442813066, + "learning_rate": 5.889737953067742e-07, + "loss": 1.4733, + "step": 3958 + }, + { + "epoch": 0.2758596662369787, + "grad_norm": 0.7187232399212806, + "learning_rate": 5.889212427853205e-07, + "loss": 1.5881, + "step": 3959 + }, + { + "epoch": 0.275929345364596, + "grad_norm": 0.7783928113723274, + "learning_rate": 5.888686804915423e-07, + "loss": 1.7394, + "step": 3960 + }, + { + "epoch": 0.27599902449221336, + "grad_norm": 0.7800510926427071, + "learning_rate": 5.888161084279584e-07, + "loss": 1.5735, + "step": 3961 + }, + { + "epoch": 0.27606870361983066, + "grad_norm": 0.6975159879900961, + "learning_rate": 5.887635265970882e-07, + "loss": 1.5884, + "step": 3962 + }, + { + "epoch": 0.276138382747448, + "grad_norm": 0.8555360238849415, + "learning_rate": 5.887109350014513e-07, + "loss": 1.5826, + "step": 3963 + }, + { + "epoch": 0.2762080618750653, + "grad_norm": 0.7028176190290327, + "learning_rate": 5.886583336435683e-07, + "loss": 1.7248, + "step": 3964 + }, + { + "epoch": 0.27627774100268265, + "grad_norm": 0.7432092852493237, + "learning_rate": 5.886057225259598e-07, + "loss": 1.6639, + "step": 3965 + }, + { + "epoch": 0.27634742013029995, + "grad_norm": 0.6896597838445886, + "learning_rate": 5.885531016511472e-07, + "loss": 1.447, + "step": 3966 + }, + { + "epoch": 0.2764170992579173, + "grad_norm": 0.720433857116974, + "learning_rate": 5.885004710216519e-07, + "loss": 1.6086, + "step": 3967 + }, + { + "epoch": 0.2764867783855346, + "grad_norm": 0.7668175443909877, + "learning_rate": 5.884478306399965e-07, + "loss": 1.5574, + "step": 3968 + }, + { + "epoch": 0.27655645751315194, + "grad_norm": 0.7209226325400172, + "learning_rate": 5.883951805087033e-07, + "loss": 1.6687, + "step": 3969 + }, + { + "epoch": 0.27662613664076924, + "grad_norm": 0.714311583829575, + "learning_rate": 5.883425206302952e-07, + "loss": 1.5147, + "step": 3970 + }, + { + "epoch": 0.2766958157683866, + "grad_norm": 0.6742840641439894, + "learning_rate": 5.882898510072964e-07, + "loss": 1.4254, + "step": 3971 + }, + { + "epoch": 0.2767654948960039, + "grad_norm": 0.7348778247791798, + "learning_rate": 5.882371716422306e-07, + "loss": 1.4907, + "step": 3972 + }, + { + "epoch": 0.27683517402362123, + "grad_norm": 0.6579709981742292, + "learning_rate": 5.88184482537622e-07, + "loss": 1.4232, + "step": 3973 + }, + { + "epoch": 0.27690485315123853, + "grad_norm": 0.7227626916827766, + "learning_rate": 5.881317836959961e-07, + "loss": 1.4126, + "step": 3974 + }, + { + "epoch": 0.2769745322788559, + "grad_norm": 0.699676254107354, + "learning_rate": 5.880790751198782e-07, + "loss": 1.564, + "step": 3975 + }, + { + "epoch": 0.2770442114064732, + "grad_norm": 0.6913603310218001, + "learning_rate": 5.880263568117939e-07, + "loss": 1.6078, + "step": 3976 + }, + { + "epoch": 0.2771138905340905, + "grad_norm": 0.6826484309669981, + "learning_rate": 5.879736287742698e-07, + "loss": 1.6187, + "step": 3977 + }, + { + "epoch": 0.2771835696617078, + "grad_norm": 0.7649106951889587, + "learning_rate": 5.879208910098327e-07, + "loss": 1.6896, + "step": 3978 + }, + { + "epoch": 0.27725324878932517, + "grad_norm": 0.686641906084249, + "learning_rate": 5.878681435210099e-07, + "loss": 1.4748, + "step": 3979 + }, + { + "epoch": 0.27732292791694246, + "grad_norm": 0.7413788736872214, + "learning_rate": 5.878153863103294e-07, + "loss": 1.6367, + "step": 3980 + }, + { + "epoch": 0.2773926070445598, + "grad_norm": 0.7146264681803929, + "learning_rate": 5.87762619380319e-07, + "loss": 1.423, + "step": 3981 + }, + { + "epoch": 0.2774622861721771, + "grad_norm": 0.7193547975548729, + "learning_rate": 5.877098427335077e-07, + "loss": 1.5612, + "step": 3982 + }, + { + "epoch": 0.27753196529979446, + "grad_norm": 0.6911172013518755, + "learning_rate": 5.876570563724246e-07, + "loss": 1.6294, + "step": 3983 + }, + { + "epoch": 0.27760164442741175, + "grad_norm": 0.7974987700336611, + "learning_rate": 5.876042602995991e-07, + "loss": 1.5826, + "step": 3984 + }, + { + "epoch": 0.2776713235550291, + "grad_norm": 0.7523075392775425, + "learning_rate": 5.875514545175619e-07, + "loss": 1.4486, + "step": 3985 + }, + { + "epoch": 0.2777410026826464, + "grad_norm": 0.7105791555841411, + "learning_rate": 5.874986390288428e-07, + "loss": 1.526, + "step": 3986 + }, + { + "epoch": 0.27781068181026375, + "grad_norm": 0.6911026667088176, + "learning_rate": 5.874458138359734e-07, + "loss": 1.4773, + "step": 3987 + }, + { + "epoch": 0.27788036093788104, + "grad_norm": 0.769290264633232, + "learning_rate": 5.873929789414849e-07, + "loss": 1.5241, + "step": 3988 + }, + { + "epoch": 0.2779500400654984, + "grad_norm": 0.7393921952918916, + "learning_rate": 5.873401343479093e-07, + "loss": 1.5214, + "step": 3989 + }, + { + "epoch": 0.2780197191931157, + "grad_norm": 0.6990692068762214, + "learning_rate": 5.872872800577792e-07, + "loss": 1.4944, + "step": 3990 + }, + { + "epoch": 0.27808939832073304, + "grad_norm": 0.7482070834918554, + "learning_rate": 5.872344160736273e-07, + "loss": 1.4291, + "step": 3991 + }, + { + "epoch": 0.27815907744835033, + "grad_norm": 0.7684926653711424, + "learning_rate": 5.87181542397987e-07, + "loss": 1.449, + "step": 3992 + }, + { + "epoch": 0.2782287565759677, + "grad_norm": 0.7400846004615286, + "learning_rate": 5.871286590333921e-07, + "loss": 1.571, + "step": 3993 + }, + { + "epoch": 0.278298435703585, + "grad_norm": 0.7117553166829831, + "learning_rate": 5.870757659823769e-07, + "loss": 1.5885, + "step": 3994 + }, + { + "epoch": 0.27836811483120233, + "grad_norm": 0.7295296260321572, + "learning_rate": 5.870228632474761e-07, + "loss": 1.573, + "step": 3995 + }, + { + "epoch": 0.2784377939588196, + "grad_norm": 0.7096933784710855, + "learning_rate": 5.869699508312251e-07, + "loss": 1.6313, + "step": 3996 + }, + { + "epoch": 0.278507473086437, + "grad_norm": 0.74818849404883, + "learning_rate": 5.869170287361592e-07, + "loss": 1.6776, + "step": 3997 + }, + { + "epoch": 0.27857715221405427, + "grad_norm": 0.6633573767886243, + "learning_rate": 5.868640969648149e-07, + "loss": 1.5791, + "step": 3998 + }, + { + "epoch": 0.2786468313416716, + "grad_norm": 0.7037151501018837, + "learning_rate": 5.868111555197287e-07, + "loss": 1.5421, + "step": 3999 + }, + { + "epoch": 0.2787165104692889, + "grad_norm": 0.7677023462476129, + "learning_rate": 5.867582044034374e-07, + "loss": 1.4824, + "step": 4000 + }, + { + "epoch": 0.27878618959690626, + "grad_norm": 0.6969241390359108, + "learning_rate": 5.86705243618479e-07, + "loss": 1.6216, + "step": 4001 + }, + { + "epoch": 0.27885586872452356, + "grad_norm": 0.6722182109071275, + "learning_rate": 5.86652273167391e-07, + "loss": 1.5251, + "step": 4002 + }, + { + "epoch": 0.2789255478521409, + "grad_norm": 0.7489623752803286, + "learning_rate": 5.865992930527123e-07, + "loss": 1.4817, + "step": 4003 + }, + { + "epoch": 0.2789952269797582, + "grad_norm": 0.7007812459698798, + "learning_rate": 5.865463032769814e-07, + "loss": 1.5225, + "step": 4004 + }, + { + "epoch": 0.27906490610737555, + "grad_norm": 0.7121219333031956, + "learning_rate": 5.86493303842738e-07, + "loss": 1.5117, + "step": 4005 + }, + { + "epoch": 0.27913458523499285, + "grad_norm": 0.6799027790844323, + "learning_rate": 5.864402947525218e-07, + "loss": 1.5251, + "step": 4006 + }, + { + "epoch": 0.2792042643626102, + "grad_norm": 0.7083461002334056, + "learning_rate": 5.863872760088732e-07, + "loss": 1.526, + "step": 4007 + }, + { + "epoch": 0.2792739434902275, + "grad_norm": 0.6869380534638632, + "learning_rate": 5.863342476143329e-07, + "loss": 1.548, + "step": 4008 + }, + { + "epoch": 0.27934362261784484, + "grad_norm": 0.7917821268802485, + "learning_rate": 5.86281209571442e-07, + "loss": 1.4851, + "step": 4009 + }, + { + "epoch": 0.27941330174546214, + "grad_norm": 0.7288384722578055, + "learning_rate": 5.862281618827423e-07, + "loss": 1.5221, + "step": 4010 + }, + { + "epoch": 0.2794829808730795, + "grad_norm": 0.737623419529946, + "learning_rate": 5.861751045507761e-07, + "loss": 1.5496, + "step": 4011 + }, + { + "epoch": 0.2795526600006968, + "grad_norm": 0.7023565178847206, + "learning_rate": 5.861220375780858e-07, + "loss": 1.6295, + "step": 4012 + }, + { + "epoch": 0.27962233912831413, + "grad_norm": 0.8024061445348584, + "learning_rate": 5.860689609672146e-07, + "loss": 1.6185, + "step": 4013 + }, + { + "epoch": 0.27969201825593143, + "grad_norm": 0.7407329543983926, + "learning_rate": 5.86015874720706e-07, + "loss": 1.6296, + "step": 4014 + }, + { + "epoch": 0.2797616973835488, + "grad_norm": 0.656469116523215, + "learning_rate": 5.85962778841104e-07, + "loss": 1.4228, + "step": 4015 + }, + { + "epoch": 0.2798313765111661, + "grad_norm": 0.6946187772623037, + "learning_rate": 5.85909673330953e-07, + "loss": 1.4918, + "step": 4016 + }, + { + "epoch": 0.2799010556387834, + "grad_norm": 0.7373641595783593, + "learning_rate": 5.858565581927981e-07, + "loss": 1.6121, + "step": 4017 + }, + { + "epoch": 0.2799707347664007, + "grad_norm": 0.7532752424720006, + "learning_rate": 5.858034334291845e-07, + "loss": 1.5603, + "step": 4018 + }, + { + "epoch": 0.28004041389401807, + "grad_norm": 0.7264475887960263, + "learning_rate": 5.857502990426582e-07, + "loss": 1.4764, + "step": 4019 + }, + { + "epoch": 0.28011009302163536, + "grad_norm": 0.798784286364674, + "learning_rate": 5.856971550357653e-07, + "loss": 1.5344, + "step": 4020 + }, + { + "epoch": 0.2801797721492527, + "grad_norm": 0.7176879232704745, + "learning_rate": 5.856440014110529e-07, + "loss": 1.6604, + "step": 4021 + }, + { + "epoch": 0.28024945127687, + "grad_norm": 0.6773894896943677, + "learning_rate": 5.855908381710679e-07, + "loss": 1.4547, + "step": 4022 + }, + { + "epoch": 0.28031913040448736, + "grad_norm": 0.7127957867651743, + "learning_rate": 5.855376653183582e-07, + "loss": 1.5388, + "step": 4023 + }, + { + "epoch": 0.28038880953210465, + "grad_norm": 0.7166579306263381, + "learning_rate": 5.854844828554719e-07, + "loss": 1.5743, + "step": 4024 + }, + { + "epoch": 0.280458488659722, + "grad_norm": 0.7410980916162924, + "learning_rate": 5.854312907849575e-07, + "loss": 1.4147, + "step": 4025 + }, + { + "epoch": 0.2805281677873393, + "grad_norm": 0.7268041723692737, + "learning_rate": 5.853780891093643e-07, + "loss": 1.5175, + "step": 4026 + }, + { + "epoch": 0.28059784691495665, + "grad_norm": 0.7321423766832356, + "learning_rate": 5.853248778312416e-07, + "loss": 1.5852, + "step": 4027 + }, + { + "epoch": 0.28066752604257394, + "grad_norm": 0.6934596891068451, + "learning_rate": 5.852716569531395e-07, + "loss": 1.5311, + "step": 4028 + }, + { + "epoch": 0.2807372051701913, + "grad_norm": 0.7081751885804427, + "learning_rate": 5.852184264776085e-07, + "loss": 1.5793, + "step": 4029 + }, + { + "epoch": 0.2808068842978086, + "grad_norm": 0.715593124627546, + "learning_rate": 5.851651864071994e-07, + "loss": 1.5773, + "step": 4030 + }, + { + "epoch": 0.28087656342542594, + "grad_norm": 0.8029234811798415, + "learning_rate": 5.851119367444636e-07, + "loss": 1.5563, + "step": 4031 + }, + { + "epoch": 0.28094624255304324, + "grad_norm": 0.7836241719894106, + "learning_rate": 5.850586774919531e-07, + "loss": 1.5504, + "step": 4032 + }, + { + "epoch": 0.2810159216806606, + "grad_norm": 0.7444625174947146, + "learning_rate": 5.8500540865222e-07, + "loss": 1.5774, + "step": 4033 + }, + { + "epoch": 0.2810856008082779, + "grad_norm": 0.7682038540781259, + "learning_rate": 5.849521302278171e-07, + "loss": 1.6068, + "step": 4034 + }, + { + "epoch": 0.2811552799358952, + "grad_norm": 0.6977564285986588, + "learning_rate": 5.848988422212977e-07, + "loss": 1.539, + "step": 4035 + }, + { + "epoch": 0.2812249590635125, + "grad_norm": 0.6861978787885887, + "learning_rate": 5.848455446352152e-07, + "loss": 1.5767, + "step": 4036 + }, + { + "epoch": 0.2812946381911298, + "grad_norm": 0.7061641280191499, + "learning_rate": 5.847922374721241e-07, + "loss": 1.5446, + "step": 4037 + }, + { + "epoch": 0.28136431731874717, + "grad_norm": 0.7162478499272291, + "learning_rate": 5.847389207345788e-07, + "loss": 1.5821, + "step": 4038 + }, + { + "epoch": 0.28143399644636446, + "grad_norm": 0.704977072425101, + "learning_rate": 5.846855944251343e-07, + "loss": 1.6258, + "step": 4039 + }, + { + "epoch": 0.2815036755739818, + "grad_norm": 0.7989239527820603, + "learning_rate": 5.846322585463462e-07, + "loss": 1.5252, + "step": 4040 + }, + { + "epoch": 0.2815733547015991, + "grad_norm": 0.7620500570883405, + "learning_rate": 5.845789131007705e-07, + "loss": 1.4945, + "step": 4041 + }, + { + "epoch": 0.28164303382921646, + "grad_norm": 0.7065137235650111, + "learning_rate": 5.845255580909634e-07, + "loss": 1.5273, + "step": 4042 + }, + { + "epoch": 0.28171271295683376, + "grad_norm": 0.72292442941867, + "learning_rate": 5.844721935194821e-07, + "loss": 1.6439, + "step": 4043 + }, + { + "epoch": 0.2817823920844511, + "grad_norm": 0.7135494906603791, + "learning_rate": 5.844188193888838e-07, + "loss": 1.6209, + "step": 4044 + }, + { + "epoch": 0.2818520712120684, + "grad_norm": 0.7238594633780343, + "learning_rate": 5.843654357017261e-07, + "loss": 1.6056, + "step": 4045 + }, + { + "epoch": 0.28192175033968575, + "grad_norm": 0.7195992753862425, + "learning_rate": 5.843120424605675e-07, + "loss": 1.6093, + "step": 4046 + }, + { + "epoch": 0.28199142946730305, + "grad_norm": 0.7102552233590842, + "learning_rate": 5.842586396679666e-07, + "loss": 1.5801, + "step": 4047 + }, + { + "epoch": 0.2820611085949204, + "grad_norm": 0.7237963977095361, + "learning_rate": 5.842052273264828e-07, + "loss": 1.599, + "step": 4048 + }, + { + "epoch": 0.2821307877225377, + "grad_norm": 0.7388017933797084, + "learning_rate": 5.841518054386754e-07, + "loss": 1.4611, + "step": 4049 + }, + { + "epoch": 0.28220046685015504, + "grad_norm": 0.7169800257321843, + "learning_rate": 5.840983740071046e-07, + "loss": 1.5368, + "step": 4050 + }, + { + "epoch": 0.28227014597777234, + "grad_norm": 0.7591467618718327, + "learning_rate": 5.840449330343311e-07, + "loss": 1.4682, + "step": 4051 + }, + { + "epoch": 0.2823398251053897, + "grad_norm": 0.7816992052749729, + "learning_rate": 5.839914825229157e-07, + "loss": 1.5462, + "step": 4052 + }, + { + "epoch": 0.282409504233007, + "grad_norm": 0.7678402184087277, + "learning_rate": 5.839380224754199e-07, + "loss": 1.411, + "step": 4053 + }, + { + "epoch": 0.28247918336062433, + "grad_norm": 0.7008928225544877, + "learning_rate": 5.838845528944057e-07, + "loss": 1.5995, + "step": 4054 + }, + { + "epoch": 0.2825488624882416, + "grad_norm": 0.7036680440880047, + "learning_rate": 5.838310737824353e-07, + "loss": 1.5138, + "step": 4055 + }, + { + "epoch": 0.282618541615859, + "grad_norm": 0.6918801095209847, + "learning_rate": 5.837775851420719e-07, + "loss": 1.5122, + "step": 4056 + }, + { + "epoch": 0.28268822074347627, + "grad_norm": 0.710151105832471, + "learning_rate": 5.837240869758785e-07, + "loss": 1.6328, + "step": 4057 + }, + { + "epoch": 0.2827578998710936, + "grad_norm": 6.128195129119697, + "learning_rate": 5.836705792864187e-07, + "loss": 1.4178, + "step": 4058 + }, + { + "epoch": 0.2828275789987109, + "grad_norm": 0.6829741022309734, + "learning_rate": 5.83617062076257e-07, + "loss": 1.5073, + "step": 4059 + }, + { + "epoch": 0.28289725812632827, + "grad_norm": 0.715180603423623, + "learning_rate": 5.835635353479579e-07, + "loss": 1.7047, + "step": 4060 + }, + { + "epoch": 0.28296693725394556, + "grad_norm": 0.7240989089434184, + "learning_rate": 5.835099991040865e-07, + "loss": 1.6419, + "step": 4061 + }, + { + "epoch": 0.2830366163815629, + "grad_norm": 1.4884192988469906, + "learning_rate": 5.834564533472084e-07, + "loss": 1.5715, + "step": 4062 + }, + { + "epoch": 0.2831062955091802, + "grad_norm": 0.6869430028687137, + "learning_rate": 5.834028980798897e-07, + "loss": 1.6057, + "step": 4063 + }, + { + "epoch": 0.28317597463679756, + "grad_norm": 0.7492810286800211, + "learning_rate": 5.833493333046969e-07, + "loss": 1.5047, + "step": 4064 + }, + { + "epoch": 0.28324565376441485, + "grad_norm": 0.6528949631329686, + "learning_rate": 5.832957590241967e-07, + "loss": 1.4532, + "step": 4065 + }, + { + "epoch": 0.2833153328920322, + "grad_norm": 0.69702222687072, + "learning_rate": 5.832421752409567e-07, + "loss": 1.5506, + "step": 4066 + }, + { + "epoch": 0.2833850120196495, + "grad_norm": 0.7244086971828632, + "learning_rate": 5.831885819575447e-07, + "loss": 1.5833, + "step": 4067 + }, + { + "epoch": 0.28345469114726685, + "grad_norm": 0.7019005426267827, + "learning_rate": 5.831349791765289e-07, + "loss": 1.4391, + "step": 4068 + }, + { + "epoch": 0.28352437027488414, + "grad_norm": 0.7200978367651489, + "learning_rate": 5.830813669004781e-07, + "loss": 1.477, + "step": 4069 + }, + { + "epoch": 0.2835940494025015, + "grad_norm": 0.6700016328199216, + "learning_rate": 5.830277451319616e-07, + "loss": 1.4821, + "step": 4070 + }, + { + "epoch": 0.2836637285301188, + "grad_norm": 0.7013166629088055, + "learning_rate": 5.829741138735491e-07, + "loss": 1.5592, + "step": 4071 + }, + { + "epoch": 0.28373340765773614, + "grad_norm": 0.7464737988751529, + "learning_rate": 5.829204731278105e-07, + "loss": 1.5095, + "step": 4072 + }, + { + "epoch": 0.28380308678535343, + "grad_norm": 0.7175291663947816, + "learning_rate": 5.828668228973166e-07, + "loss": 1.6289, + "step": 4073 + }, + { + "epoch": 0.2838727659129708, + "grad_norm": 0.772606784486943, + "learning_rate": 5.828131631846383e-07, + "loss": 1.4315, + "step": 4074 + }, + { + "epoch": 0.2839424450405881, + "grad_norm": 0.6821940080682076, + "learning_rate": 5.82759493992347e-07, + "loss": 1.603, + "step": 4075 + }, + { + "epoch": 0.2840121241682054, + "grad_norm": 0.6781925194254677, + "learning_rate": 5.827058153230149e-07, + "loss": 1.4863, + "step": 4076 + }, + { + "epoch": 0.2840818032958227, + "grad_norm": 0.7357770305547745, + "learning_rate": 5.826521271792142e-07, + "loss": 1.6178, + "step": 4077 + }, + { + "epoch": 0.28415148242344007, + "grad_norm": 0.6701447160402433, + "learning_rate": 5.825984295635178e-07, + "loss": 1.4617, + "step": 4078 + }, + { + "epoch": 0.28422116155105737, + "grad_norm": 0.7456256029388817, + "learning_rate": 5.82544722478499e-07, + "loss": 1.5877, + "step": 4079 + }, + { + "epoch": 0.2842908406786747, + "grad_norm": 0.7150322734011164, + "learning_rate": 5.824910059267316e-07, + "loss": 1.4595, + "step": 4080 + }, + { + "epoch": 0.284360519806292, + "grad_norm": 0.7762158243973858, + "learning_rate": 5.824372799107898e-07, + "loss": 1.4091, + "step": 4081 + }, + { + "epoch": 0.28443019893390936, + "grad_norm": 0.6999482983696204, + "learning_rate": 5.823835444332481e-07, + "loss": 1.5803, + "step": 4082 + }, + { + "epoch": 0.28449987806152666, + "grad_norm": 0.7179465348485711, + "learning_rate": 5.823297994966817e-07, + "loss": 1.5976, + "step": 4083 + }, + { + "epoch": 0.284569557189144, + "grad_norm": 0.6886196672918611, + "learning_rate": 5.822760451036663e-07, + "loss": 1.4657, + "step": 4084 + }, + { + "epoch": 0.2846392363167613, + "grad_norm": 0.7343911706726971, + "learning_rate": 5.822222812567777e-07, + "loss": 1.5281, + "step": 4085 + }, + { + "epoch": 0.28470891544437865, + "grad_norm": 0.7747706894286983, + "learning_rate": 5.821685079585925e-07, + "loss": 1.6097, + "step": 4086 + }, + { + "epoch": 0.28477859457199595, + "grad_norm": 0.7459684826116418, + "learning_rate": 5.821147252116877e-07, + "loss": 1.6015, + "step": 4087 + }, + { + "epoch": 0.2848482736996133, + "grad_norm": 0.7068888527292315, + "learning_rate": 5.820609330186406e-07, + "loss": 1.552, + "step": 4088 + }, + { + "epoch": 0.2849179528272306, + "grad_norm": 0.6849166782257857, + "learning_rate": 5.82007131382029e-07, + "loss": 1.6113, + "step": 4089 + }, + { + "epoch": 0.28498763195484794, + "grad_norm": 0.7440776028099806, + "learning_rate": 5.819533203044312e-07, + "loss": 1.8013, + "step": 4090 + }, + { + "epoch": 0.28505731108246524, + "grad_norm": 0.6927799473369971, + "learning_rate": 5.81899499788426e-07, + "loss": 1.5277, + "step": 4091 + }, + { + "epoch": 0.2851269902100826, + "grad_norm": 0.7629208548117576, + "learning_rate": 5.818456698365925e-07, + "loss": 1.5684, + "step": 4092 + }, + { + "epoch": 0.2851966693376999, + "grad_norm": 0.7884306874549732, + "learning_rate": 5.817918304515104e-07, + "loss": 1.5751, + "step": 4093 + }, + { + "epoch": 0.28526634846531723, + "grad_norm": 0.7391133979352249, + "learning_rate": 5.817379816357597e-07, + "loss": 1.5001, + "step": 4094 + }, + { + "epoch": 0.2853360275929345, + "grad_norm": 0.6698145354407297, + "learning_rate": 5.816841233919212e-07, + "loss": 1.4165, + "step": 4095 + }, + { + "epoch": 0.2854057067205519, + "grad_norm": 0.792643478722233, + "learning_rate": 5.816302557225756e-07, + "loss": 1.5447, + "step": 4096 + }, + { + "epoch": 0.28547538584816917, + "grad_norm": 0.7136178558225388, + "learning_rate": 5.815763786303045e-07, + "loss": 1.5373, + "step": 4097 + }, + { + "epoch": 0.2855450649757865, + "grad_norm": 0.7575039417167665, + "learning_rate": 5.815224921176897e-07, + "loss": 1.6437, + "step": 4098 + }, + { + "epoch": 0.2856147441034038, + "grad_norm": 0.7584843652406438, + "learning_rate": 5.814685961873138e-07, + "loss": 1.583, + "step": 4099 + }, + { + "epoch": 0.28568442323102117, + "grad_norm": 0.7576203179121972, + "learning_rate": 5.814146908417594e-07, + "loss": 1.5119, + "step": 4100 + }, + { + "epoch": 0.28575410235863846, + "grad_norm": 0.7598044895034622, + "learning_rate": 5.813607760836097e-07, + "loss": 1.574, + "step": 4101 + }, + { + "epoch": 0.2858237814862558, + "grad_norm": 0.6735244316784529, + "learning_rate": 5.813068519154485e-07, + "loss": 1.5738, + "step": 4102 + }, + { + "epoch": 0.2858934606138731, + "grad_norm": 0.6868829077636657, + "learning_rate": 5.812529183398598e-07, + "loss": 1.5322, + "step": 4103 + }, + { + "epoch": 0.28596313974149046, + "grad_norm": 0.7419971118801489, + "learning_rate": 5.811989753594286e-07, + "loss": 1.5351, + "step": 4104 + }, + { + "epoch": 0.28603281886910775, + "grad_norm": 0.7270043780129994, + "learning_rate": 5.811450229767396e-07, + "loss": 1.3956, + "step": 4105 + }, + { + "epoch": 0.2861024979967251, + "grad_norm": 0.7054461678186529, + "learning_rate": 5.810910611943784e-07, + "loss": 1.5752, + "step": 4106 + }, + { + "epoch": 0.2861721771243424, + "grad_norm": 0.8449176564524208, + "learning_rate": 5.810370900149311e-07, + "loss": 1.5074, + "step": 4107 + }, + { + "epoch": 0.28624185625195975, + "grad_norm": 0.7140677921475737, + "learning_rate": 5.809831094409838e-07, + "loss": 1.5529, + "step": 4108 + }, + { + "epoch": 0.28631153537957704, + "grad_norm": 0.7074402998095285, + "learning_rate": 5.809291194751236e-07, + "loss": 1.4844, + "step": 4109 + }, + { + "epoch": 0.2863812145071944, + "grad_norm": 0.7303614882913259, + "learning_rate": 5.808751201199379e-07, + "loss": 1.5742, + "step": 4110 + }, + { + "epoch": 0.2864508936348117, + "grad_norm": 0.7183991860832905, + "learning_rate": 5.808211113780142e-07, + "loss": 1.4775, + "step": 4111 + }, + { + "epoch": 0.28652057276242904, + "grad_norm": 0.8003586154955813, + "learning_rate": 5.807670932519409e-07, + "loss": 1.5617, + "step": 4112 + }, + { + "epoch": 0.28659025189004633, + "grad_norm": 0.7474000305808857, + "learning_rate": 5.807130657443066e-07, + "loss": 1.4609, + "step": 4113 + }, + { + "epoch": 0.2866599310176637, + "grad_norm": 0.6674786568948675, + "learning_rate": 5.806590288577002e-07, + "loss": 1.473, + "step": 4114 + }, + { + "epoch": 0.286729610145281, + "grad_norm": 0.7505481394579682, + "learning_rate": 5.806049825947117e-07, + "loss": 1.5761, + "step": 4115 + }, + { + "epoch": 0.28679928927289833, + "grad_norm": 0.7296521930923773, + "learning_rate": 5.805509269579308e-07, + "loss": 1.4665, + "step": 4116 + }, + { + "epoch": 0.2868689684005156, + "grad_norm": 0.687910460173145, + "learning_rate": 5.804968619499479e-07, + "loss": 1.5137, + "step": 4117 + }, + { + "epoch": 0.286938647528133, + "grad_norm": 0.7760861013120258, + "learning_rate": 5.804427875733541e-07, + "loss": 1.5681, + "step": 4118 + }, + { + "epoch": 0.28700832665575027, + "grad_norm": 0.6496291087556836, + "learning_rate": 5.803887038307407e-07, + "loss": 1.4863, + "step": 4119 + }, + { + "epoch": 0.2870780057833676, + "grad_norm": 0.6963744044109444, + "learning_rate": 5.803346107246995e-07, + "loss": 1.4003, + "step": 4120 + }, + { + "epoch": 0.2871476849109849, + "grad_norm": 0.7441529427156599, + "learning_rate": 5.802805082578228e-07, + "loss": 1.4934, + "step": 4121 + }, + { + "epoch": 0.28721736403860226, + "grad_norm": 0.6879418056717342, + "learning_rate": 5.802263964327031e-07, + "loss": 1.4888, + "step": 4122 + }, + { + "epoch": 0.28728704316621956, + "grad_norm": 0.7485835173848666, + "learning_rate": 5.801722752519338e-07, + "loss": 1.5704, + "step": 4123 + }, + { + "epoch": 0.2873567222938369, + "grad_norm": 0.702037064253478, + "learning_rate": 5.801181447181083e-07, + "loss": 1.5496, + "step": 4124 + }, + { + "epoch": 0.2874264014214542, + "grad_norm": 0.7026946239503448, + "learning_rate": 5.800640048338209e-07, + "loss": 1.5774, + "step": 4125 + }, + { + "epoch": 0.28749608054907155, + "grad_norm": 0.7491119945124249, + "learning_rate": 5.800098556016658e-07, + "loss": 1.4153, + "step": 4126 + }, + { + "epoch": 0.28756575967668885, + "grad_norm": 0.6902498898799309, + "learning_rate": 5.79955697024238e-07, + "loss": 1.5892, + "step": 4127 + }, + { + "epoch": 0.28763543880430614, + "grad_norm": 0.7063696011417466, + "learning_rate": 5.79901529104133e-07, + "loss": 1.5036, + "step": 4128 + }, + { + "epoch": 0.2877051179319235, + "grad_norm": 0.6764332043828444, + "learning_rate": 5.798473518439467e-07, + "loss": 1.4542, + "step": 4129 + }, + { + "epoch": 0.2877747970595408, + "grad_norm": 0.7574873462504658, + "learning_rate": 5.797931652462752e-07, + "loss": 1.5444, + "step": 4130 + }, + { + "epoch": 0.28784447618715814, + "grad_norm": 0.6919679634888707, + "learning_rate": 5.797389693137154e-07, + "loss": 1.5891, + "step": 4131 + }, + { + "epoch": 0.28791415531477543, + "grad_norm": 0.7380176334601344, + "learning_rate": 5.796847640488644e-07, + "loss": 1.6671, + "step": 4132 + }, + { + "epoch": 0.2879838344423928, + "grad_norm": 0.8050340076019923, + "learning_rate": 5.796305494543197e-07, + "loss": 1.5894, + "step": 4133 + }, + { + "epoch": 0.2880535135700101, + "grad_norm": 0.7487002362938737, + "learning_rate": 5.795763255326796e-07, + "loss": 1.4812, + "step": 4134 + }, + { + "epoch": 0.28812319269762743, + "grad_norm": 0.7369552008147401, + "learning_rate": 5.795220922865426e-07, + "loss": 1.4874, + "step": 4135 + }, + { + "epoch": 0.2881928718252447, + "grad_norm": 5.4765373368585175, + "learning_rate": 5.794678497185075e-07, + "loss": 1.5605, + "step": 4136 + }, + { + "epoch": 0.2882625509528621, + "grad_norm": 0.7284276944673779, + "learning_rate": 5.794135978311737e-07, + "loss": 1.5034, + "step": 4137 + }, + { + "epoch": 0.28833223008047937, + "grad_norm": 0.7995026203041848, + "learning_rate": 5.793593366271413e-07, + "loss": 1.5816, + "step": 4138 + }, + { + "epoch": 0.2884019092080967, + "grad_norm": 0.7179892897013607, + "learning_rate": 5.793050661090105e-07, + "loss": 1.5214, + "step": 4139 + }, + { + "epoch": 0.288471588335714, + "grad_norm": 0.7446689591066641, + "learning_rate": 5.79250786279382e-07, + "loss": 1.6026, + "step": 4140 + }, + { + "epoch": 0.28854126746333136, + "grad_norm": 0.7414451978063824, + "learning_rate": 5.791964971408569e-07, + "loss": 1.507, + "step": 4141 + }, + { + "epoch": 0.28861094659094866, + "grad_norm": 0.7118717774250654, + "learning_rate": 5.791421986960371e-07, + "loss": 1.4607, + "step": 4142 + }, + { + "epoch": 0.288680625718566, + "grad_norm": 0.7274986192626761, + "learning_rate": 5.790878909475246e-07, + "loss": 1.6512, + "step": 4143 + }, + { + "epoch": 0.2887503048461833, + "grad_norm": 0.7470589216705154, + "learning_rate": 5.790335738979218e-07, + "loss": 1.5379, + "step": 4144 + }, + { + "epoch": 0.28881998397380065, + "grad_norm": 0.7192847739232004, + "learning_rate": 5.789792475498319e-07, + "loss": 1.5424, + "step": 4145 + }, + { + "epoch": 0.28888966310141795, + "grad_norm": 0.7808717081028412, + "learning_rate": 5.789249119058582e-07, + "loss": 1.5689, + "step": 4146 + }, + { + "epoch": 0.2889593422290353, + "grad_norm": 0.7165423217935094, + "learning_rate": 5.788705669686047e-07, + "loss": 1.5309, + "step": 4147 + }, + { + "epoch": 0.2890290213566526, + "grad_norm": 0.7413403373071283, + "learning_rate": 5.788162127406755e-07, + "loss": 1.6779, + "step": 4148 + }, + { + "epoch": 0.28909870048426994, + "grad_norm": 0.7088883743890833, + "learning_rate": 5.787618492246754e-07, + "loss": 1.638, + "step": 4149 + }, + { + "epoch": 0.28916837961188724, + "grad_norm": 0.6863263458241512, + "learning_rate": 5.787074764232098e-07, + "loss": 1.5491, + "step": 4150 + }, + { + "epoch": 0.2892380587395046, + "grad_norm": 0.7306183676069988, + "learning_rate": 5.786530943388843e-07, + "loss": 1.5322, + "step": 4151 + }, + { + "epoch": 0.2893077378671219, + "grad_norm": 0.718996793284283, + "learning_rate": 5.78598702974305e-07, + "loss": 1.567, + "step": 4152 + }, + { + "epoch": 0.28937741699473923, + "grad_norm": 0.7430070802419222, + "learning_rate": 5.785443023320782e-07, + "loss": 1.5341, + "step": 4153 + }, + { + "epoch": 0.28944709612235653, + "grad_norm": 0.7173099226439454, + "learning_rate": 5.784898924148112e-07, + "loss": 1.5728, + "step": 4154 + }, + { + "epoch": 0.2895167752499739, + "grad_norm": 0.7298294434551115, + "learning_rate": 5.784354732251114e-07, + "loss": 1.5361, + "step": 4155 + }, + { + "epoch": 0.2895864543775912, + "grad_norm": 0.7063577470300629, + "learning_rate": 5.783810447655865e-07, + "loss": 1.5852, + "step": 4156 + }, + { + "epoch": 0.2896561335052085, + "grad_norm": 0.746458774354094, + "learning_rate": 5.78326607038845e-07, + "loss": 1.5576, + "step": 4157 + }, + { + "epoch": 0.2897258126328258, + "grad_norm": 0.7408594387198453, + "learning_rate": 5.782721600474956e-07, + "loss": 1.5814, + "step": 4158 + }, + { + "epoch": 0.28979549176044317, + "grad_norm": 0.7430836064324008, + "learning_rate": 5.782177037941475e-07, + "loss": 1.467, + "step": 4159 + }, + { + "epoch": 0.28986517088806046, + "grad_norm": 0.7198430677931947, + "learning_rate": 5.781632382814104e-07, + "loss": 1.4687, + "step": 4160 + }, + { + "epoch": 0.2899348500156778, + "grad_norm": 0.7252227953435738, + "learning_rate": 5.781087635118942e-07, + "loss": 1.6479, + "step": 4161 + }, + { + "epoch": 0.2900045291432951, + "grad_norm": 0.686562113644517, + "learning_rate": 5.780542794882098e-07, + "loss": 1.482, + "step": 4162 + }, + { + "epoch": 0.29007420827091246, + "grad_norm": 0.7181583474767024, + "learning_rate": 5.779997862129679e-07, + "loss": 1.5554, + "step": 4163 + }, + { + "epoch": 0.29014388739852975, + "grad_norm": 0.7422299136780489, + "learning_rate": 5.779452836887801e-07, + "loss": 1.6733, + "step": 4164 + }, + { + "epoch": 0.2902135665261471, + "grad_norm": 0.6927016820321847, + "learning_rate": 5.77890771918258e-07, + "loss": 1.5271, + "step": 4165 + }, + { + "epoch": 0.2902832456537644, + "grad_norm": 0.6829369165723617, + "learning_rate": 5.778362509040143e-07, + "loss": 1.5014, + "step": 4166 + }, + { + "epoch": 0.29035292478138175, + "grad_norm": 0.6620607702711369, + "learning_rate": 5.777817206486616e-07, + "loss": 1.6308, + "step": 4167 + }, + { + "epoch": 0.29042260390899904, + "grad_norm": 0.7073017154807985, + "learning_rate": 5.77727181154813e-07, + "loss": 1.5599, + "step": 4168 + }, + { + "epoch": 0.2904922830366164, + "grad_norm": 0.8033268829212622, + "learning_rate": 5.776726324250822e-07, + "loss": 1.5079, + "step": 4169 + }, + { + "epoch": 0.2905619621642337, + "grad_norm": 0.7209486887046835, + "learning_rate": 5.776180744620833e-07, + "loss": 1.5049, + "step": 4170 + }, + { + "epoch": 0.29063164129185104, + "grad_norm": 0.8022012464666788, + "learning_rate": 5.775635072684308e-07, + "loss": 1.4163, + "step": 4171 + }, + { + "epoch": 0.29070132041946833, + "grad_norm": 0.7277687745750111, + "learning_rate": 5.775089308467398e-07, + "loss": 1.5629, + "step": 4172 + }, + { + "epoch": 0.2907709995470857, + "grad_norm": 0.783412324097584, + "learning_rate": 5.774543451996256e-07, + "loss": 1.6353, + "step": 4173 + }, + { + "epoch": 0.290840678674703, + "grad_norm": 0.7209177364651561, + "learning_rate": 5.773997503297041e-07, + "loss": 1.5835, + "step": 4174 + }, + { + "epoch": 0.29091035780232033, + "grad_norm": 0.7199756416577356, + "learning_rate": 5.773451462395915e-07, + "loss": 1.5281, + "step": 4175 + }, + { + "epoch": 0.2909800369299376, + "grad_norm": 0.7322879678085863, + "learning_rate": 5.772905329319047e-07, + "loss": 1.3603, + "step": 4176 + }, + { + "epoch": 0.291049716057555, + "grad_norm": 0.6590412334963223, + "learning_rate": 5.772359104092607e-07, + "loss": 1.4824, + "step": 4177 + }, + { + "epoch": 0.29111939518517227, + "grad_norm": 0.7445913105232103, + "learning_rate": 5.771812786742773e-07, + "loss": 1.5666, + "step": 4178 + }, + { + "epoch": 0.2911890743127896, + "grad_norm": 0.7165402829789416, + "learning_rate": 5.771266377295725e-07, + "loss": 1.5071, + "step": 4179 + }, + { + "epoch": 0.2912587534404069, + "grad_norm": 0.6794848939177912, + "learning_rate": 5.770719875777647e-07, + "loss": 1.4203, + "step": 4180 + }, + { + "epoch": 0.29132843256802426, + "grad_norm": 0.7532430788398345, + "learning_rate": 5.77017328221473e-07, + "loss": 1.6119, + "step": 4181 + }, + { + "epoch": 0.29139811169564156, + "grad_norm": 0.7048663985537814, + "learning_rate": 5.769626596633167e-07, + "loss": 1.6076, + "step": 4182 + }, + { + "epoch": 0.2914677908232589, + "grad_norm": 0.7260710522617362, + "learning_rate": 5.769079819059156e-07, + "loss": 1.5931, + "step": 4183 + }, + { + "epoch": 0.2915374699508762, + "grad_norm": 0.7085546777892404, + "learning_rate": 5.768532949518901e-07, + "loss": 1.5788, + "step": 4184 + }, + { + "epoch": 0.29160714907849355, + "grad_norm": 0.7351359290443842, + "learning_rate": 5.767985988038609e-07, + "loss": 1.4791, + "step": 4185 + }, + { + "epoch": 0.29167682820611085, + "grad_norm": 0.7345803290205484, + "learning_rate": 5.767438934644489e-07, + "loss": 1.498, + "step": 4186 + }, + { + "epoch": 0.2917465073337282, + "grad_norm": 0.7275728966569317, + "learning_rate": 5.766891789362761e-07, + "loss": 1.5793, + "step": 4187 + }, + { + "epoch": 0.2918161864613455, + "grad_norm": 0.6385572204281225, + "learning_rate": 5.766344552219643e-07, + "loss": 1.5234, + "step": 4188 + }, + { + "epoch": 0.29188586558896285, + "grad_norm": 0.6878039246866837, + "learning_rate": 5.765797223241358e-07, + "loss": 1.529, + "step": 4189 + }, + { + "epoch": 0.29195554471658014, + "grad_norm": 0.7407162221510748, + "learning_rate": 5.765249802454138e-07, + "loss": 1.4297, + "step": 4190 + }, + { + "epoch": 0.2920252238441975, + "grad_norm": 0.6954475217623283, + "learning_rate": 5.764702289884216e-07, + "loss": 1.5887, + "step": 4191 + }, + { + "epoch": 0.2920949029718148, + "grad_norm": 0.7438942748646914, + "learning_rate": 5.764154685557829e-07, + "loss": 1.5754, + "step": 4192 + }, + { + "epoch": 0.29216458209943214, + "grad_norm": 0.7878280970340501, + "learning_rate": 5.763606989501221e-07, + "loss": 1.5281, + "step": 4193 + }, + { + "epoch": 0.29223426122704943, + "grad_norm": 0.6856592601877685, + "learning_rate": 5.763059201740636e-07, + "loss": 1.5176, + "step": 4194 + }, + { + "epoch": 0.2923039403546668, + "grad_norm": 0.8063588131594583, + "learning_rate": 5.762511322302326e-07, + "loss": 1.6245, + "step": 4195 + }, + { + "epoch": 0.2923736194822841, + "grad_norm": 0.7078649971602377, + "learning_rate": 5.761963351212548e-07, + "loss": 1.5293, + "step": 4196 + }, + { + "epoch": 0.2924432986099014, + "grad_norm": 0.6867274360359023, + "learning_rate": 5.761415288497562e-07, + "loss": 1.4691, + "step": 4197 + }, + { + "epoch": 0.2925129777375187, + "grad_norm": 0.6938725129579134, + "learning_rate": 5.76086713418363e-07, + "loss": 1.5709, + "step": 4198 + }, + { + "epoch": 0.29258265686513607, + "grad_norm": 0.6964301335240399, + "learning_rate": 5.760318888297023e-07, + "loss": 1.5175, + "step": 4199 + }, + { + "epoch": 0.29265233599275337, + "grad_norm": 0.7176137893004337, + "learning_rate": 5.759770550864012e-07, + "loss": 1.6088, + "step": 4200 + }, + { + "epoch": 0.2927220151203707, + "grad_norm": 0.6600968939310983, + "learning_rate": 5.759222121910876e-07, + "loss": 1.5252, + "step": 4201 + }, + { + "epoch": 0.292791694247988, + "grad_norm": 0.7713602795893594, + "learning_rate": 5.758673601463897e-07, + "loss": 1.7007, + "step": 4202 + }, + { + "epoch": 0.29286137337560536, + "grad_norm": 0.8521496590271287, + "learning_rate": 5.758124989549359e-07, + "loss": 1.3938, + "step": 4203 + }, + { + "epoch": 0.29293105250322266, + "grad_norm": 0.7064840849897108, + "learning_rate": 5.757576286193557e-07, + "loss": 1.6102, + "step": 4204 + }, + { + "epoch": 0.29300073163084, + "grad_norm": 0.7073473475058616, + "learning_rate": 5.75702749142278e-07, + "loss": 1.4099, + "step": 4205 + }, + { + "epoch": 0.2930704107584573, + "grad_norm": 0.7497971375461359, + "learning_rate": 5.756478605263332e-07, + "loss": 1.59, + "step": 4206 + }, + { + "epoch": 0.29314008988607465, + "grad_norm": 0.7483325900051251, + "learning_rate": 5.755929627741515e-07, + "loss": 1.5683, + "step": 4207 + }, + { + "epoch": 0.29320976901369195, + "grad_norm": 0.677131842423103, + "learning_rate": 5.755380558883638e-07, + "loss": 1.5654, + "step": 4208 + }, + { + "epoch": 0.2932794481413093, + "grad_norm": 0.7090599829679397, + "learning_rate": 5.754831398716012e-07, + "loss": 1.5853, + "step": 4209 + }, + { + "epoch": 0.2933491272689266, + "grad_norm": 0.6531789620922398, + "learning_rate": 5.754282147264955e-07, + "loss": 1.5282, + "step": 4210 + }, + { + "epoch": 0.29341880639654394, + "grad_norm": 0.7290449241279219, + "learning_rate": 5.753732804556789e-07, + "loss": 1.5285, + "step": 4211 + }, + { + "epoch": 0.29348848552416124, + "grad_norm": 0.696260443545441, + "learning_rate": 5.753183370617839e-07, + "loss": 1.5345, + "step": 4212 + }, + { + "epoch": 0.2935581646517786, + "grad_norm": 0.7252444210476912, + "learning_rate": 5.752633845474433e-07, + "loss": 1.5398, + "step": 4213 + }, + { + "epoch": 0.2936278437793959, + "grad_norm": 0.7312914360002516, + "learning_rate": 5.752084229152909e-07, + "loss": 1.5642, + "step": 4214 + }, + { + "epoch": 0.29369752290701323, + "grad_norm": 0.724005882627236, + "learning_rate": 5.751534521679603e-07, + "loss": 1.4495, + "step": 4215 + }, + { + "epoch": 0.2937672020346305, + "grad_norm": 0.7720337660286007, + "learning_rate": 5.750984723080859e-07, + "loss": 1.5852, + "step": 4216 + }, + { + "epoch": 0.2938368811622479, + "grad_norm": 0.6758714480813982, + "learning_rate": 5.750434833383024e-07, + "loss": 1.4688, + "step": 4217 + }, + { + "epoch": 0.29390656028986517, + "grad_norm": 0.6951183763692467, + "learning_rate": 5.74988485261245e-07, + "loss": 1.5259, + "step": 4218 + }, + { + "epoch": 0.29397623941748247, + "grad_norm": 0.6941762057762565, + "learning_rate": 5.749334780795495e-07, + "loss": 1.4688, + "step": 4219 + }, + { + "epoch": 0.2940459185450998, + "grad_norm": 0.7445950618080178, + "learning_rate": 5.748784617958516e-07, + "loss": 1.5763, + "step": 4220 + }, + { + "epoch": 0.2941155976727171, + "grad_norm": 0.7090111805366819, + "learning_rate": 5.748234364127881e-07, + "loss": 1.6148, + "step": 4221 + }, + { + "epoch": 0.29418527680033446, + "grad_norm": 0.6931440631837924, + "learning_rate": 5.747684019329958e-07, + "loss": 1.4726, + "step": 4222 + }, + { + "epoch": 0.29425495592795176, + "grad_norm": 0.6616793591516503, + "learning_rate": 5.747133583591122e-07, + "loss": 1.495, + "step": 4223 + }, + { + "epoch": 0.2943246350555691, + "grad_norm": 0.7047857600329261, + "learning_rate": 5.746583056937749e-07, + "loss": 1.4379, + "step": 4224 + }, + { + "epoch": 0.2943943141831864, + "grad_norm": 0.7691171573820788, + "learning_rate": 5.746032439396223e-07, + "loss": 1.7682, + "step": 4225 + }, + { + "epoch": 0.29446399331080375, + "grad_norm": 0.6925654078498741, + "learning_rate": 5.745481730992929e-07, + "loss": 1.5043, + "step": 4226 + }, + { + "epoch": 0.29453367243842105, + "grad_norm": 0.7204041277305288, + "learning_rate": 5.74493093175426e-07, + "loss": 1.62, + "step": 4227 + }, + { + "epoch": 0.2946033515660384, + "grad_norm": 0.784007626003321, + "learning_rate": 5.74438004170661e-07, + "loss": 1.5427, + "step": 4228 + }, + { + "epoch": 0.2946730306936557, + "grad_norm": 0.7054496982926721, + "learning_rate": 5.743829060876379e-07, + "loss": 1.6203, + "step": 4229 + }, + { + "epoch": 0.29474270982127304, + "grad_norm": 0.7036820710436857, + "learning_rate": 5.743277989289972e-07, + "loss": 1.4374, + "step": 4230 + }, + { + "epoch": 0.29481238894889034, + "grad_norm": 0.7043837784856916, + "learning_rate": 5.742726826973797e-07, + "loss": 1.5413, + "step": 4231 + }, + { + "epoch": 0.2948820680765077, + "grad_norm": 0.7269167952414226, + "learning_rate": 5.742175573954266e-07, + "loss": 1.4719, + "step": 4232 + }, + { + "epoch": 0.294951747204125, + "grad_norm": 0.706476929301725, + "learning_rate": 5.741624230257798e-07, + "loss": 1.533, + "step": 4233 + }, + { + "epoch": 0.29502142633174233, + "grad_norm": 0.7356825908807404, + "learning_rate": 5.741072795910813e-07, + "loss": 1.437, + "step": 4234 + }, + { + "epoch": 0.2950911054593596, + "grad_norm": 0.7040136084896028, + "learning_rate": 5.740521270939737e-07, + "loss": 1.4609, + "step": 4235 + }, + { + "epoch": 0.295160784586977, + "grad_norm": 0.7340794981592375, + "learning_rate": 5.739969655371e-07, + "loss": 1.5591, + "step": 4236 + }, + { + "epoch": 0.29523046371459427, + "grad_norm": 0.6603536126599652, + "learning_rate": 5.739417949231038e-07, + "loss": 1.4947, + "step": 4237 + }, + { + "epoch": 0.2953001428422116, + "grad_norm": 0.7751036977717093, + "learning_rate": 5.73886615254629e-07, + "loss": 1.5717, + "step": 4238 + }, + { + "epoch": 0.2953698219698289, + "grad_norm": 0.791476669810281, + "learning_rate": 5.738314265343196e-07, + "loss": 1.5748, + "step": 4239 + }, + { + "epoch": 0.29543950109744627, + "grad_norm": 0.7362906055325384, + "learning_rate": 5.737762287648207e-07, + "loss": 1.605, + "step": 4240 + }, + { + "epoch": 0.29550918022506356, + "grad_norm": 0.723257144222618, + "learning_rate": 5.737210219487774e-07, + "loss": 1.4971, + "step": 4241 + }, + { + "epoch": 0.2955788593526809, + "grad_norm": 0.6947517250277025, + "learning_rate": 5.736658060888352e-07, + "loss": 1.5147, + "step": 4242 + }, + { + "epoch": 0.2956485384802982, + "grad_norm": 0.7270369921297178, + "learning_rate": 5.736105811876403e-07, + "loss": 1.492, + "step": 4243 + }, + { + "epoch": 0.29571821760791556, + "grad_norm": 0.708560313274988, + "learning_rate": 5.735553472478391e-07, + "loss": 1.3891, + "step": 4244 + }, + { + "epoch": 0.29578789673553285, + "grad_norm": 0.7285901567444775, + "learning_rate": 5.735001042720786e-07, + "loss": 1.6297, + "step": 4245 + }, + { + "epoch": 0.2958575758631502, + "grad_norm": 0.7960332124526845, + "learning_rate": 5.734448522630062e-07, + "loss": 1.5527, + "step": 4246 + }, + { + "epoch": 0.2959272549907675, + "grad_norm": 0.7020497113259067, + "learning_rate": 5.733895912232694e-07, + "loss": 1.6506, + "step": 4247 + }, + { + "epoch": 0.29599693411838485, + "grad_norm": 0.715234323713793, + "learning_rate": 5.733343211555169e-07, + "loss": 1.5195, + "step": 4248 + }, + { + "epoch": 0.29606661324600214, + "grad_norm": 0.7032592778948775, + "learning_rate": 5.732790420623969e-07, + "loss": 1.5182, + "step": 4249 + }, + { + "epoch": 0.2961362923736195, + "grad_norm": 0.6762321778401789, + "learning_rate": 5.732237539465586e-07, + "loss": 1.5153, + "step": 4250 + }, + { + "epoch": 0.2962059715012368, + "grad_norm": 0.6585325758077796, + "learning_rate": 5.731684568106518e-07, + "loss": 1.4654, + "step": 4251 + }, + { + "epoch": 0.29627565062885414, + "grad_norm": 0.7659023292111404, + "learning_rate": 5.731131506573262e-07, + "loss": 1.5343, + "step": 4252 + }, + { + "epoch": 0.29634532975647143, + "grad_norm": 0.7176805803072737, + "learning_rate": 5.730578354892322e-07, + "loss": 1.5905, + "step": 4253 + }, + { + "epoch": 0.2964150088840888, + "grad_norm": 0.698554222722932, + "learning_rate": 5.730025113090206e-07, + "loss": 1.486, + "step": 4254 + }, + { + "epoch": 0.2964846880117061, + "grad_norm": 0.7916029679362232, + "learning_rate": 5.729471781193427e-07, + "loss": 1.5445, + "step": 4255 + }, + { + "epoch": 0.2965543671393234, + "grad_norm": 0.6874181377702069, + "learning_rate": 5.728918359228502e-07, + "loss": 1.4352, + "step": 4256 + }, + { + "epoch": 0.2966240462669407, + "grad_norm": 0.7579198602498406, + "learning_rate": 5.728364847221953e-07, + "loss": 1.4514, + "step": 4257 + }, + { + "epoch": 0.29669372539455807, + "grad_norm": 0.7097472989631317, + "learning_rate": 5.727811245200302e-07, + "loss": 1.5043, + "step": 4258 + }, + { + "epoch": 0.29676340452217537, + "grad_norm": 0.6564182632603695, + "learning_rate": 5.727257553190083e-07, + "loss": 1.4526, + "step": 4259 + }, + { + "epoch": 0.2968330836497927, + "grad_norm": 0.7058523448481508, + "learning_rate": 5.726703771217827e-07, + "loss": 1.6799, + "step": 4260 + }, + { + "epoch": 0.29690276277741, + "grad_norm": 0.7245857438315936, + "learning_rate": 5.726149899310075e-07, + "loss": 1.5592, + "step": 4261 + }, + { + "epoch": 0.29697244190502736, + "grad_norm": 0.7315326028212079, + "learning_rate": 5.725595937493366e-07, + "loss": 1.5439, + "step": 4262 + }, + { + "epoch": 0.29704212103264466, + "grad_norm": 0.7195140259415446, + "learning_rate": 5.72504188579425e-07, + "loss": 1.4122, + "step": 4263 + }, + { + "epoch": 0.297111800160262, + "grad_norm": 0.7164306723650375, + "learning_rate": 5.724487744239278e-07, + "loss": 1.5216, + "step": 4264 + }, + { + "epoch": 0.2971814792878793, + "grad_norm": 0.7744066165079407, + "learning_rate": 5.723933512855005e-07, + "loss": 1.5427, + "step": 4265 + }, + { + "epoch": 0.29725115841549665, + "grad_norm": 0.6820385126931663, + "learning_rate": 5.72337919166799e-07, + "loss": 1.5005, + "step": 4266 + }, + { + "epoch": 0.29732083754311395, + "grad_norm": 0.7602095640104902, + "learning_rate": 5.7228247807048e-07, + "loss": 1.5856, + "step": 4267 + }, + { + "epoch": 0.2973905166707313, + "grad_norm": 0.7376863216947709, + "learning_rate": 5.722270279992e-07, + "loss": 1.5361, + "step": 4268 + }, + { + "epoch": 0.2974601957983486, + "grad_norm": 0.6976848335979555, + "learning_rate": 5.721715689556165e-07, + "loss": 1.6066, + "step": 4269 + }, + { + "epoch": 0.29752987492596594, + "grad_norm": 0.7387189895719177, + "learning_rate": 5.721161009423872e-07, + "loss": 1.5509, + "step": 4270 + }, + { + "epoch": 0.29759955405358324, + "grad_norm": 0.7470502890559497, + "learning_rate": 5.720606239621701e-07, + "loss": 1.5248, + "step": 4271 + }, + { + "epoch": 0.2976692331812006, + "grad_norm": 0.7507968058964529, + "learning_rate": 5.72005138017624e-07, + "loss": 1.5589, + "step": 4272 + }, + { + "epoch": 0.2977389123088179, + "grad_norm": 0.7697138737872022, + "learning_rate": 5.719496431114077e-07, + "loss": 1.4206, + "step": 4273 + }, + { + "epoch": 0.29780859143643523, + "grad_norm": 0.8122208452859123, + "learning_rate": 5.718941392461806e-07, + "loss": 1.7525, + "step": 4274 + }, + { + "epoch": 0.2978782705640525, + "grad_norm": 1.025275901310073, + "learning_rate": 5.718386264246029e-07, + "loss": 1.6115, + "step": 4275 + }, + { + "epoch": 0.2979479496916699, + "grad_norm": 0.7000608371774311, + "learning_rate": 5.717831046493345e-07, + "loss": 1.4787, + "step": 4276 + }, + { + "epoch": 0.2980176288192872, + "grad_norm": 0.7136322606534233, + "learning_rate": 5.717275739230363e-07, + "loss": 1.5461, + "step": 4277 + }, + { + "epoch": 0.2980873079469045, + "grad_norm": 0.747023635229759, + "learning_rate": 5.716720342483693e-07, + "loss": 1.6262, + "step": 4278 + }, + { + "epoch": 0.2981569870745218, + "grad_norm": 0.6962780662742107, + "learning_rate": 5.716164856279952e-07, + "loss": 1.5011, + "step": 4279 + }, + { + "epoch": 0.29822666620213917, + "grad_norm": 0.7586413862543497, + "learning_rate": 5.715609280645762e-07, + "loss": 1.5255, + "step": 4280 + }, + { + "epoch": 0.29829634532975646, + "grad_norm": 0.7048885106327465, + "learning_rate": 5.715053615607744e-07, + "loss": 1.5426, + "step": 4281 + }, + { + "epoch": 0.2983660244573738, + "grad_norm": 0.6822702776681451, + "learning_rate": 5.714497861192527e-07, + "loss": 1.564, + "step": 4282 + }, + { + "epoch": 0.2984357035849911, + "grad_norm": 0.738493339810172, + "learning_rate": 5.713942017426747e-07, + "loss": 1.4922, + "step": 4283 + }, + { + "epoch": 0.29850538271260846, + "grad_norm": 0.7083738712536051, + "learning_rate": 5.713386084337038e-07, + "loss": 1.3278, + "step": 4284 + }, + { + "epoch": 0.29857506184022575, + "grad_norm": 0.819880393354702, + "learning_rate": 5.712830061950042e-07, + "loss": 1.6089, + "step": 4285 + }, + { + "epoch": 0.2986447409678431, + "grad_norm": 0.7375639649661877, + "learning_rate": 5.712273950292404e-07, + "loss": 1.5276, + "step": 4286 + }, + { + "epoch": 0.2987144200954604, + "grad_norm": 0.7537522147243475, + "learning_rate": 5.711717749390776e-07, + "loss": 1.4919, + "step": 4287 + }, + { + "epoch": 0.29878409922307775, + "grad_norm": 0.6781966488725063, + "learning_rate": 5.711161459271812e-07, + "loss": 1.5011, + "step": 4288 + }, + { + "epoch": 0.29885377835069504, + "grad_norm": 0.8326959141972974, + "learning_rate": 5.710605079962171e-07, + "loss": 1.5864, + "step": 4289 + }, + { + "epoch": 0.2989234574783124, + "grad_norm": 0.7750678881838795, + "learning_rate": 5.710048611488512e-07, + "loss": 1.6775, + "step": 4290 + }, + { + "epoch": 0.2989931366059297, + "grad_norm": 0.7339000247889513, + "learning_rate": 5.709492053877506e-07, + "loss": 1.627, + "step": 4291 + }, + { + "epoch": 0.29906281573354704, + "grad_norm": 0.7341569990588388, + "learning_rate": 5.708935407155824e-07, + "loss": 1.5164, + "step": 4292 + }, + { + "epoch": 0.29913249486116433, + "grad_norm": 0.8037081802813852, + "learning_rate": 5.708378671350141e-07, + "loss": 1.4966, + "step": 4293 + }, + { + "epoch": 0.2992021739887817, + "grad_norm": 0.6627371432304198, + "learning_rate": 5.707821846487136e-07, + "loss": 1.3968, + "step": 4294 + }, + { + "epoch": 0.299271853116399, + "grad_norm": 0.767241188025273, + "learning_rate": 5.707264932593494e-07, + "loss": 1.6399, + "step": 4295 + }, + { + "epoch": 0.29934153224401633, + "grad_norm": 0.6914039403317371, + "learning_rate": 5.706707929695905e-07, + "loss": 1.5115, + "step": 4296 + }, + { + "epoch": 0.2994112113716336, + "grad_norm": 0.6980323825789414, + "learning_rate": 5.706150837821059e-07, + "loss": 1.4639, + "step": 4297 + }, + { + "epoch": 0.299480890499251, + "grad_norm": 0.7171078956726246, + "learning_rate": 5.705593656995654e-07, + "loss": 1.611, + "step": 4298 + }, + { + "epoch": 0.29955056962686827, + "grad_norm": 0.7332984331849871, + "learning_rate": 5.705036387246393e-07, + "loss": 1.6235, + "step": 4299 + }, + { + "epoch": 0.2996202487544856, + "grad_norm": 0.7008562486112845, + "learning_rate": 5.704479028599979e-07, + "loss": 1.486, + "step": 4300 + }, + { + "epoch": 0.2996899278821029, + "grad_norm": 0.764475428172602, + "learning_rate": 5.703921581083123e-07, + "loss": 1.5944, + "step": 4301 + }, + { + "epoch": 0.29975960700972026, + "grad_norm": 0.9630472761992942, + "learning_rate": 5.703364044722539e-07, + "loss": 1.6101, + "step": 4302 + }, + { + "epoch": 0.29982928613733756, + "grad_norm": 0.7357985164170848, + "learning_rate": 5.702806419544945e-07, + "loss": 1.5459, + "step": 4303 + }, + { + "epoch": 0.2998989652649549, + "grad_norm": 0.6860971242574258, + "learning_rate": 5.702248705577064e-07, + "loss": 1.3803, + "step": 4304 + }, + { + "epoch": 0.2999686443925722, + "grad_norm": 0.7307420289792195, + "learning_rate": 5.701690902845622e-07, + "loss": 1.5831, + "step": 4305 + }, + { + "epoch": 0.30003832352018955, + "grad_norm": 0.7160864235298279, + "learning_rate": 5.701133011377349e-07, + "loss": 1.4446, + "step": 4306 + }, + { + "epoch": 0.30010800264780685, + "grad_norm": 0.7127607913585317, + "learning_rate": 5.700575031198983e-07, + "loss": 1.6148, + "step": 4307 + }, + { + "epoch": 0.3001776817754242, + "grad_norm": 0.7371661893242732, + "learning_rate": 5.700016962337264e-07, + "loss": 1.584, + "step": 4308 + }, + { + "epoch": 0.3002473609030415, + "grad_norm": 0.6862056831220482, + "learning_rate": 5.699458804818933e-07, + "loss": 1.5426, + "step": 4309 + }, + { + "epoch": 0.30031704003065884, + "grad_norm": 0.9409326647764209, + "learning_rate": 5.698900558670737e-07, + "loss": 1.5583, + "step": 4310 + }, + { + "epoch": 0.30038671915827614, + "grad_norm": 0.707354464843671, + "learning_rate": 5.698342223919433e-07, + "loss": 1.3867, + "step": 4311 + }, + { + "epoch": 0.30045639828589343, + "grad_norm": 0.6662497690540862, + "learning_rate": 5.697783800591775e-07, + "loss": 1.5286, + "step": 4312 + }, + { + "epoch": 0.3005260774135108, + "grad_norm": 0.6995012600576833, + "learning_rate": 5.697225288714523e-07, + "loss": 1.4394, + "step": 4313 + }, + { + "epoch": 0.3005957565411281, + "grad_norm": 0.6587298537731207, + "learning_rate": 5.696666688314442e-07, + "loss": 1.5089, + "step": 4314 + }, + { + "epoch": 0.30066543566874543, + "grad_norm": 0.6730697833442947, + "learning_rate": 5.696107999418305e-07, + "loss": 1.5451, + "step": 4315 + }, + { + "epoch": 0.3007351147963627, + "grad_norm": 0.7094606803913679, + "learning_rate": 5.69554922205288e-07, + "loss": 1.4908, + "step": 4316 + }, + { + "epoch": 0.3008047939239801, + "grad_norm": 0.6509023335460346, + "learning_rate": 5.69499035624495e-07, + "loss": 1.5202, + "step": 4317 + }, + { + "epoch": 0.30087447305159737, + "grad_norm": 0.7491001156354492, + "learning_rate": 5.694431402021292e-07, + "loss": 1.5196, + "step": 4318 + }, + { + "epoch": 0.3009441521792147, + "grad_norm": 0.6870728473339202, + "learning_rate": 5.693872359408696e-07, + "loss": 1.4587, + "step": 4319 + }, + { + "epoch": 0.301013831306832, + "grad_norm": 0.6913259421135415, + "learning_rate": 5.69331322843395e-07, + "loss": 1.6039, + "step": 4320 + }, + { + "epoch": 0.30108351043444936, + "grad_norm": 0.7570791386505102, + "learning_rate": 5.69275400912385e-07, + "loss": 1.6491, + "step": 4321 + }, + { + "epoch": 0.30115318956206666, + "grad_norm": 0.7419386183508703, + "learning_rate": 5.692194701505195e-07, + "loss": 1.5979, + "step": 4322 + }, + { + "epoch": 0.301222868689684, + "grad_norm": 0.7168472397929991, + "learning_rate": 5.691635305604789e-07, + "loss": 1.6062, + "step": 4323 + }, + { + "epoch": 0.3012925478173013, + "grad_norm": 0.7273318013404496, + "learning_rate": 5.691075821449437e-07, + "loss": 1.5357, + "step": 4324 + }, + { + "epoch": 0.30136222694491865, + "grad_norm": 0.703456514068365, + "learning_rate": 5.690516249065953e-07, + "loss": 1.7195, + "step": 4325 + }, + { + "epoch": 0.30143190607253595, + "grad_norm": 0.7146216193702607, + "learning_rate": 5.689956588481151e-07, + "loss": 1.5897, + "step": 4326 + }, + { + "epoch": 0.3015015852001533, + "grad_norm": 0.6979947721924662, + "learning_rate": 5.689396839721853e-07, + "loss": 1.5119, + "step": 4327 + }, + { + "epoch": 0.3015712643277706, + "grad_norm": 0.7286369517144264, + "learning_rate": 5.688837002814881e-07, + "loss": 1.5625, + "step": 4328 + }, + { + "epoch": 0.30164094345538794, + "grad_norm": 0.7002598809033803, + "learning_rate": 5.688277077787065e-07, + "loss": 1.4626, + "step": 4329 + }, + { + "epoch": 0.30171062258300524, + "grad_norm": 0.7152829336567289, + "learning_rate": 5.687717064665239e-07, + "loss": 1.6106, + "step": 4330 + }, + { + "epoch": 0.3017803017106226, + "grad_norm": 0.7341268521670578, + "learning_rate": 5.687156963476236e-07, + "loss": 1.5397, + "step": 4331 + }, + { + "epoch": 0.3018499808382399, + "grad_norm": 0.6993141305033542, + "learning_rate": 5.686596774246903e-07, + "loss": 1.4845, + "step": 4332 + }, + { + "epoch": 0.30191965996585723, + "grad_norm": 0.7425939500950345, + "learning_rate": 5.686036497004079e-07, + "loss": 1.5275, + "step": 4333 + }, + { + "epoch": 0.30198933909347453, + "grad_norm": 0.8728770492993477, + "learning_rate": 5.685476131774617e-07, + "loss": 1.617, + "step": 4334 + }, + { + "epoch": 0.3020590182210919, + "grad_norm": 0.7165920132387991, + "learning_rate": 5.684915678585372e-07, + "loss": 1.4761, + "step": 4335 + }, + { + "epoch": 0.3021286973487092, + "grad_norm": 0.7206231125446654, + "learning_rate": 5.684355137463201e-07, + "loss": 1.6038, + "step": 4336 + }, + { + "epoch": 0.3021983764763265, + "grad_norm": 0.7461449598093063, + "learning_rate": 5.683794508434965e-07, + "loss": 1.4806, + "step": 4337 + }, + { + "epoch": 0.3022680556039438, + "grad_norm": 0.7326703682099187, + "learning_rate": 5.683233791527532e-07, + "loss": 1.5679, + "step": 4338 + }, + { + "epoch": 0.30233773473156117, + "grad_norm": 0.7098570289778796, + "learning_rate": 5.682672986767771e-07, + "loss": 1.6465, + "step": 4339 + }, + { + "epoch": 0.30240741385917846, + "grad_norm": 0.7338137182161039, + "learning_rate": 5.682112094182559e-07, + "loss": 1.5681, + "step": 4340 + }, + { + "epoch": 0.3024770929867958, + "grad_norm": 0.7598766115415722, + "learning_rate": 5.681551113798774e-07, + "loss": 1.7009, + "step": 4341 + }, + { + "epoch": 0.3025467721144131, + "grad_norm": 0.6912780506537121, + "learning_rate": 5.680990045643299e-07, + "loss": 1.4817, + "step": 4342 + }, + { + "epoch": 0.30261645124203046, + "grad_norm": 0.7008636273971501, + "learning_rate": 5.680428889743023e-07, + "loss": 1.5617, + "step": 4343 + }, + { + "epoch": 0.30268613036964775, + "grad_norm": 0.7017393439038174, + "learning_rate": 5.679867646124837e-07, + "loss": 1.5042, + "step": 4344 + }, + { + "epoch": 0.3027558094972651, + "grad_norm": 0.7272041861547763, + "learning_rate": 5.679306314815636e-07, + "loss": 1.4487, + "step": 4345 + }, + { + "epoch": 0.3028254886248824, + "grad_norm": 0.7117474417769055, + "learning_rate": 5.678744895842321e-07, + "loss": 1.4574, + "step": 4346 + }, + { + "epoch": 0.30289516775249975, + "grad_norm": 0.7026478170110361, + "learning_rate": 5.678183389231796e-07, + "loss": 1.4453, + "step": 4347 + }, + { + "epoch": 0.30296484688011704, + "grad_norm": 0.713541128071128, + "learning_rate": 5.67762179501097e-07, + "loss": 1.5474, + "step": 4348 + }, + { + "epoch": 0.3030345260077344, + "grad_norm": 0.7447825123847247, + "learning_rate": 5.677060113206756e-07, + "loss": 1.5962, + "step": 4349 + }, + { + "epoch": 0.3031042051353517, + "grad_norm": 0.6790629074737846, + "learning_rate": 5.67649834384607e-07, + "loss": 1.629, + "step": 4350 + }, + { + "epoch": 0.30317388426296904, + "grad_norm": 0.7346346969447306, + "learning_rate": 5.675936486955834e-07, + "loss": 1.6648, + "step": 4351 + }, + { + "epoch": 0.30324356339058633, + "grad_norm": 0.7110715667981556, + "learning_rate": 5.675374542562973e-07, + "loss": 1.5896, + "step": 4352 + }, + { + "epoch": 0.3033132425182037, + "grad_norm": 0.7142436991764997, + "learning_rate": 5.674812510694416e-07, + "loss": 1.5098, + "step": 4353 + }, + { + "epoch": 0.303382921645821, + "grad_norm": 0.7105548896306177, + "learning_rate": 5.674250391377097e-07, + "loss": 1.5081, + "step": 4354 + }, + { + "epoch": 0.30345260077343833, + "grad_norm": 0.7322611956071579, + "learning_rate": 5.673688184637956e-07, + "loss": 1.5588, + "step": 4355 + }, + { + "epoch": 0.3035222799010556, + "grad_norm": 0.6954733835570996, + "learning_rate": 5.673125890503932e-07, + "loss": 1.4469, + "step": 4356 + }, + { + "epoch": 0.303591959028673, + "grad_norm": 0.7341910322709105, + "learning_rate": 5.672563509001972e-07, + "loss": 1.4872, + "step": 4357 + }, + { + "epoch": 0.30366163815629027, + "grad_norm": 0.688288685718414, + "learning_rate": 5.672001040159026e-07, + "loss": 1.4781, + "step": 4358 + }, + { + "epoch": 0.3037313172839076, + "grad_norm": 0.7411868407592087, + "learning_rate": 5.67143848400205e-07, + "loss": 1.6026, + "step": 4359 + }, + { + "epoch": 0.3038009964115249, + "grad_norm": 0.6503515017270078, + "learning_rate": 5.670875840558005e-07, + "loss": 1.4431, + "step": 4360 + }, + { + "epoch": 0.30387067553914227, + "grad_norm": 0.7775967475885618, + "learning_rate": 5.67031310985385e-07, + "loss": 1.5814, + "step": 4361 + }, + { + "epoch": 0.30394035466675956, + "grad_norm": 0.6685209900171873, + "learning_rate": 5.669750291916554e-07, + "loss": 1.5307, + "step": 4362 + }, + { + "epoch": 0.3040100337943769, + "grad_norm": 0.7203373515957084, + "learning_rate": 5.669187386773087e-07, + "loss": 1.5483, + "step": 4363 + }, + { + "epoch": 0.3040797129219942, + "grad_norm": 0.7274571102331661, + "learning_rate": 5.668624394450428e-07, + "loss": 1.4758, + "step": 4364 + }, + { + "epoch": 0.30414939204961156, + "grad_norm": 0.7206220254266728, + "learning_rate": 5.668061314975553e-07, + "loss": 1.4899, + "step": 4365 + }, + { + "epoch": 0.30421907117722885, + "grad_norm": 0.710236911651575, + "learning_rate": 5.667498148375447e-07, + "loss": 1.642, + "step": 4366 + }, + { + "epoch": 0.3042887503048462, + "grad_norm": 0.7139976815719326, + "learning_rate": 5.666934894677099e-07, + "loss": 1.5409, + "step": 4367 + }, + { + "epoch": 0.3043584294324635, + "grad_norm": 0.6954576104030126, + "learning_rate": 5.666371553907501e-07, + "loss": 1.434, + "step": 4368 + }, + { + "epoch": 0.30442810856008085, + "grad_norm": 0.7030492892506666, + "learning_rate": 5.665808126093649e-07, + "loss": 1.3851, + "step": 4369 + }, + { + "epoch": 0.30449778768769814, + "grad_norm": 0.6610603452433989, + "learning_rate": 5.665244611262543e-07, + "loss": 1.4934, + "step": 4370 + }, + { + "epoch": 0.3045674668153155, + "grad_norm": 0.7353844159537593, + "learning_rate": 5.66468100944119e-07, + "loss": 1.5959, + "step": 4371 + }, + { + "epoch": 0.3046371459429328, + "grad_norm": 0.7180170447023938, + "learning_rate": 5.664117320656596e-07, + "loss": 1.5525, + "step": 4372 + }, + { + "epoch": 0.30470682507055014, + "grad_norm": 0.7076752320951577, + "learning_rate": 5.663553544935777e-07, + "loss": 1.4768, + "step": 4373 + }, + { + "epoch": 0.30477650419816743, + "grad_norm": 0.7305586044258503, + "learning_rate": 5.662989682305748e-07, + "loss": 1.6246, + "step": 4374 + }, + { + "epoch": 0.3048461833257848, + "grad_norm": 0.7558819635443185, + "learning_rate": 5.662425732793532e-07, + "loss": 1.5732, + "step": 4375 + }, + { + "epoch": 0.3049158624534021, + "grad_norm": 0.7153477087807026, + "learning_rate": 5.661861696426154e-07, + "loss": 1.4463, + "step": 4376 + }, + { + "epoch": 0.3049855415810194, + "grad_norm": 0.7263139382865508, + "learning_rate": 5.661297573230644e-07, + "loss": 1.5023, + "step": 4377 + }, + { + "epoch": 0.3050552207086367, + "grad_norm": 0.7001941576629662, + "learning_rate": 5.660733363234035e-07, + "loss": 1.4831, + "step": 4378 + }, + { + "epoch": 0.30512489983625407, + "grad_norm": 0.7056967216013887, + "learning_rate": 5.660169066463367e-07, + "loss": 1.4512, + "step": 4379 + }, + { + "epoch": 0.30519457896387137, + "grad_norm": 0.722321962015521, + "learning_rate": 5.659604682945681e-07, + "loss": 1.6328, + "step": 4380 + }, + { + "epoch": 0.3052642580914887, + "grad_norm": 0.7459925387500808, + "learning_rate": 5.659040212708024e-07, + "loss": 1.5597, + "step": 4381 + }, + { + "epoch": 0.305333937219106, + "grad_norm": 0.6985403031047768, + "learning_rate": 5.658475655777445e-07, + "loss": 1.5844, + "step": 4382 + }, + { + "epoch": 0.30540361634672336, + "grad_norm": 0.6924190337217615, + "learning_rate": 5.657911012180999e-07, + "loss": 1.4897, + "step": 4383 + }, + { + "epoch": 0.30547329547434066, + "grad_norm": 0.6857257208268763, + "learning_rate": 5.657346281945748e-07, + "loss": 1.5364, + "step": 4384 + }, + { + "epoch": 0.305542974601958, + "grad_norm": 0.6960778169774027, + "learning_rate": 5.65678146509875e-07, + "loss": 1.6121, + "step": 4385 + }, + { + "epoch": 0.3056126537295753, + "grad_norm": 0.7042377335398031, + "learning_rate": 5.656216561667078e-07, + "loss": 1.6275, + "step": 4386 + }, + { + "epoch": 0.30568233285719265, + "grad_norm": 0.6764060757496547, + "learning_rate": 5.655651571677797e-07, + "loss": 1.5309, + "step": 4387 + }, + { + "epoch": 0.30575201198480995, + "grad_norm": 0.7420151820599352, + "learning_rate": 5.655086495157989e-07, + "loss": 1.515, + "step": 4388 + }, + { + "epoch": 0.3058216911124273, + "grad_norm": 0.7454176035431609, + "learning_rate": 5.654521332134729e-07, + "loss": 1.5449, + "step": 4389 + }, + { + "epoch": 0.3058913702400446, + "grad_norm": 0.7166026979384447, + "learning_rate": 5.653956082635102e-07, + "loss": 1.4952, + "step": 4390 + }, + { + "epoch": 0.30596104936766194, + "grad_norm": 0.696252526086783, + "learning_rate": 5.653390746686195e-07, + "loss": 1.4843, + "step": 4391 + }, + { + "epoch": 0.30603072849527924, + "grad_norm": 0.7198107998452128, + "learning_rate": 5.652825324315103e-07, + "loss": 1.5638, + "step": 4392 + }, + { + "epoch": 0.3061004076228966, + "grad_norm": 0.7111816471169524, + "learning_rate": 5.652259815548919e-07, + "loss": 1.6544, + "step": 4393 + }, + { + "epoch": 0.3061700867505139, + "grad_norm": 0.7687614335199426, + "learning_rate": 5.651694220414745e-07, + "loss": 1.5855, + "step": 4394 + }, + { + "epoch": 0.30623976587813123, + "grad_norm": 0.7454634904231932, + "learning_rate": 5.651128538939687e-07, + "loss": 1.6859, + "step": 4395 + }, + { + "epoch": 0.3063094450057485, + "grad_norm": 0.6789328892800132, + "learning_rate": 5.65056277115085e-07, + "loss": 1.4511, + "step": 4396 + }, + { + "epoch": 0.3063791241333659, + "grad_norm": 0.6967181617814054, + "learning_rate": 5.649996917075348e-07, + "loss": 1.5627, + "step": 4397 + }, + { + "epoch": 0.30644880326098317, + "grad_norm": 0.7221567523229675, + "learning_rate": 5.649430976740299e-07, + "loss": 1.4447, + "step": 4398 + }, + { + "epoch": 0.3065184823886005, + "grad_norm": 0.7419218988374826, + "learning_rate": 5.648864950172825e-07, + "loss": 1.6038, + "step": 4399 + }, + { + "epoch": 0.3065881615162178, + "grad_norm": 0.7543528666991685, + "learning_rate": 5.648298837400047e-07, + "loss": 1.5557, + "step": 4400 + }, + { + "epoch": 0.30665784064383517, + "grad_norm": 0.672566336166264, + "learning_rate": 5.647732638449098e-07, + "loss": 1.5224, + "step": 4401 + }, + { + "epoch": 0.30672751977145246, + "grad_norm": 0.6647018335496818, + "learning_rate": 5.64716635334711e-07, + "loss": 1.5032, + "step": 4402 + }, + { + "epoch": 0.30679719889906976, + "grad_norm": 0.7035658712733628, + "learning_rate": 5.646599982121222e-07, + "loss": 1.499, + "step": 4403 + }, + { + "epoch": 0.3068668780266871, + "grad_norm": 0.7324652829978336, + "learning_rate": 5.646033524798572e-07, + "loss": 1.6015, + "step": 4404 + }, + { + "epoch": 0.3069365571543044, + "grad_norm": 0.6581572710664613, + "learning_rate": 5.645466981406311e-07, + "loss": 1.4681, + "step": 4405 + }, + { + "epoch": 0.30700623628192175, + "grad_norm": 0.7379880634218996, + "learning_rate": 5.644900351971586e-07, + "loss": 1.5125, + "step": 4406 + }, + { + "epoch": 0.30707591540953905, + "grad_norm": 0.6937990883394584, + "learning_rate": 5.644333636521549e-07, + "loss": 1.534, + "step": 4407 + }, + { + "epoch": 0.3071455945371564, + "grad_norm": 0.709535109038696, + "learning_rate": 5.643766835083363e-07, + "loss": 1.5428, + "step": 4408 + }, + { + "epoch": 0.3072152736647737, + "grad_norm": 0.7080500790253447, + "learning_rate": 5.643199947684187e-07, + "loss": 1.6857, + "step": 4409 + }, + { + "epoch": 0.30728495279239104, + "grad_norm": 0.7306898899751795, + "learning_rate": 5.642632974351187e-07, + "loss": 1.5052, + "step": 4410 + }, + { + "epoch": 0.30735463192000834, + "grad_norm": 0.6916557398320974, + "learning_rate": 5.642065915111535e-07, + "loss": 1.5313, + "step": 4411 + }, + { + "epoch": 0.3074243110476257, + "grad_norm": 0.7456137117488936, + "learning_rate": 5.641498769992406e-07, + "loss": 1.6002, + "step": 4412 + }, + { + "epoch": 0.307493990175243, + "grad_norm": 0.7286947464768576, + "learning_rate": 5.640931539020978e-07, + "loss": 1.5603, + "step": 4413 + }, + { + "epoch": 0.30756366930286033, + "grad_norm": 0.7296045551328778, + "learning_rate": 5.640364222224435e-07, + "loss": 1.472, + "step": 4414 + }, + { + "epoch": 0.3076333484304776, + "grad_norm": 0.7188974984129131, + "learning_rate": 5.63979681962996e-07, + "loss": 1.5777, + "step": 4415 + }, + { + "epoch": 0.307703027558095, + "grad_norm": 0.6987921504744042, + "learning_rate": 5.639229331264748e-07, + "loss": 1.5582, + "step": 4416 + }, + { + "epoch": 0.30777270668571227, + "grad_norm": 0.6733705792089942, + "learning_rate": 5.638661757155995e-07, + "loss": 1.4362, + "step": 4417 + }, + { + "epoch": 0.3078423858133296, + "grad_norm": 0.7532019974955833, + "learning_rate": 5.638094097330898e-07, + "loss": 1.5592, + "step": 4418 + }, + { + "epoch": 0.3079120649409469, + "grad_norm": 0.7362829024275824, + "learning_rate": 5.63752635181666e-07, + "loss": 1.5044, + "step": 4419 + }, + { + "epoch": 0.30798174406856427, + "grad_norm": 0.6879131082588306, + "learning_rate": 5.63695852064049e-07, + "loss": 1.5981, + "step": 4420 + }, + { + "epoch": 0.30805142319618156, + "grad_norm": 0.7261822695523283, + "learning_rate": 5.636390603829599e-07, + "loss": 1.7112, + "step": 4421 + }, + { + "epoch": 0.3081211023237989, + "grad_norm": 0.8396134526299301, + "learning_rate": 5.635822601411203e-07, + "loss": 1.5046, + "step": 4422 + }, + { + "epoch": 0.3081907814514162, + "grad_norm": 0.6924912458603729, + "learning_rate": 5.635254513412522e-07, + "loss": 1.5348, + "step": 4423 + }, + { + "epoch": 0.30826046057903356, + "grad_norm": 0.7804285234866137, + "learning_rate": 5.634686339860779e-07, + "loss": 1.5138, + "step": 4424 + }, + { + "epoch": 0.30833013970665085, + "grad_norm": 0.7762150856399619, + "learning_rate": 5.634118080783203e-07, + "loss": 1.5775, + "step": 4425 + }, + { + "epoch": 0.3083998188342682, + "grad_norm": 0.7025050135748799, + "learning_rate": 5.633549736207026e-07, + "loss": 1.4535, + "step": 4426 + }, + { + "epoch": 0.3084694979618855, + "grad_norm": 0.7092815664252987, + "learning_rate": 5.632981306159483e-07, + "loss": 1.5258, + "step": 4427 + }, + { + "epoch": 0.30853917708950285, + "grad_norm": 0.7130146811467492, + "learning_rate": 5.632412790667818e-07, + "loss": 1.6053, + "step": 4428 + }, + { + "epoch": 0.30860885621712014, + "grad_norm": 0.7943626302984487, + "learning_rate": 5.631844189759271e-07, + "loss": 1.5869, + "step": 4429 + }, + { + "epoch": 0.3086785353447375, + "grad_norm": 0.7409423035412404, + "learning_rate": 5.631275503461091e-07, + "loss": 1.4726, + "step": 4430 + }, + { + "epoch": 0.3087482144723548, + "grad_norm": 0.7429732104868719, + "learning_rate": 5.630706731800535e-07, + "loss": 1.5541, + "step": 4431 + }, + { + "epoch": 0.30881789359997214, + "grad_norm": 0.7551340123253788, + "learning_rate": 5.630137874804855e-07, + "loss": 1.711, + "step": 4432 + }, + { + "epoch": 0.30888757272758943, + "grad_norm": 0.6698510994317796, + "learning_rate": 5.629568932501314e-07, + "loss": 1.4954, + "step": 4433 + }, + { + "epoch": 0.3089572518552068, + "grad_norm": 0.7099811918718965, + "learning_rate": 5.628999904917175e-07, + "loss": 1.4815, + "step": 4434 + }, + { + "epoch": 0.3090269309828241, + "grad_norm": 0.7550362727314823, + "learning_rate": 5.62843079207971e-07, + "loss": 1.5581, + "step": 4435 + }, + { + "epoch": 0.3090966101104414, + "grad_norm": 0.7185400605901908, + "learning_rate": 5.62786159401619e-07, + "loss": 1.5663, + "step": 4436 + }, + { + "epoch": 0.3091662892380587, + "grad_norm": 0.6874944643037497, + "learning_rate": 5.627292310753892e-07, + "loss": 1.6189, + "step": 4437 + }, + { + "epoch": 0.3092359683656761, + "grad_norm": 0.6862872862618756, + "learning_rate": 5.626722942320098e-07, + "loss": 1.526, + "step": 4438 + }, + { + "epoch": 0.30930564749329337, + "grad_norm": 0.777117696116382, + "learning_rate": 5.626153488742094e-07, + "loss": 1.609, + "step": 4439 + }, + { + "epoch": 0.3093753266209107, + "grad_norm": 0.7148010353520651, + "learning_rate": 5.625583950047168e-07, + "loss": 1.5535, + "step": 4440 + }, + { + "epoch": 0.309445005748528, + "grad_norm": 0.7045498396728237, + "learning_rate": 5.625014326262612e-07, + "loss": 1.5452, + "step": 4441 + }, + { + "epoch": 0.30951468487614536, + "grad_norm": 0.6919278984568086, + "learning_rate": 5.624444617415727e-07, + "loss": 1.4555, + "step": 4442 + }, + { + "epoch": 0.30958436400376266, + "grad_norm": 0.6905139175639234, + "learning_rate": 5.623874823533813e-07, + "loss": 1.5853, + "step": 4443 + }, + { + "epoch": 0.30965404313138, + "grad_norm": 0.7165338368059605, + "learning_rate": 5.623304944644177e-07, + "loss": 1.5571, + "step": 4444 + }, + { + "epoch": 0.3097237222589973, + "grad_norm": 0.7097791827485946, + "learning_rate": 5.622734980774126e-07, + "loss": 1.4597, + "step": 4445 + }, + { + "epoch": 0.30979340138661465, + "grad_norm": 0.6880495509954674, + "learning_rate": 5.622164931950975e-07, + "loss": 1.4994, + "step": 4446 + }, + { + "epoch": 0.30986308051423195, + "grad_norm": 0.7265350063500531, + "learning_rate": 5.621594798202044e-07, + "loss": 1.5729, + "step": 4447 + }, + { + "epoch": 0.3099327596418493, + "grad_norm": 0.7271006631212064, + "learning_rate": 5.621024579554652e-07, + "loss": 1.6254, + "step": 4448 + }, + { + "epoch": 0.3100024387694666, + "grad_norm": 0.7172550132449438, + "learning_rate": 5.620454276036126e-07, + "loss": 1.5689, + "step": 4449 + }, + { + "epoch": 0.31007211789708394, + "grad_norm": 0.6859933153200322, + "learning_rate": 5.619883887673798e-07, + "loss": 1.5004, + "step": 4450 + }, + { + "epoch": 0.31014179702470124, + "grad_norm": 0.747765073619636, + "learning_rate": 5.619313414494999e-07, + "loss": 1.5306, + "step": 4451 + }, + { + "epoch": 0.3102114761523186, + "grad_norm": 0.7128484452214556, + "learning_rate": 5.618742856527069e-07, + "loss": 1.5639, + "step": 4452 + }, + { + "epoch": 0.3102811552799359, + "grad_norm": 0.7681613031122471, + "learning_rate": 5.618172213797351e-07, + "loss": 1.6, + "step": 4453 + }, + { + "epoch": 0.31035083440755323, + "grad_norm": 0.7524879920443474, + "learning_rate": 5.617601486333189e-07, + "loss": 1.5381, + "step": 4454 + }, + { + "epoch": 0.31042051353517053, + "grad_norm": 0.7252015588543342, + "learning_rate": 5.617030674161936e-07, + "loss": 1.6141, + "step": 4455 + }, + { + "epoch": 0.3104901926627879, + "grad_norm": 0.7135283316475586, + "learning_rate": 5.616459777310946e-07, + "loss": 1.6824, + "step": 4456 + }, + { + "epoch": 0.3105598717904052, + "grad_norm": 0.7234761560418117, + "learning_rate": 5.615888795807577e-07, + "loss": 1.5512, + "step": 4457 + }, + { + "epoch": 0.3106295509180225, + "grad_norm": 0.6888581693005156, + "learning_rate": 5.61531772967919e-07, + "loss": 1.4997, + "step": 4458 + }, + { + "epoch": 0.3106992300456398, + "grad_norm": 0.7150965852736714, + "learning_rate": 5.614746578953155e-07, + "loss": 1.6749, + "step": 4459 + }, + { + "epoch": 0.31076890917325717, + "grad_norm": 0.685877969315356, + "learning_rate": 5.61417534365684e-07, + "loss": 1.5597, + "step": 4460 + }, + { + "epoch": 0.31083858830087446, + "grad_norm": 0.6710895076279765, + "learning_rate": 5.613604023817622e-07, + "loss": 1.5663, + "step": 4461 + }, + { + "epoch": 0.3109082674284918, + "grad_norm": 0.7201758805730143, + "learning_rate": 5.613032619462877e-07, + "loss": 1.534, + "step": 4462 + }, + { + "epoch": 0.3109779465561091, + "grad_norm": 0.7009125282431902, + "learning_rate": 5.612461130619991e-07, + "loss": 1.5826, + "step": 4463 + }, + { + "epoch": 0.31104762568372646, + "grad_norm": 0.7343800397441643, + "learning_rate": 5.611889557316349e-07, + "loss": 1.6119, + "step": 4464 + }, + { + "epoch": 0.31111730481134375, + "grad_norm": 0.6762556387092465, + "learning_rate": 5.611317899579342e-07, + "loss": 1.5175, + "step": 4465 + }, + { + "epoch": 0.3111869839389611, + "grad_norm": 0.7320624779852928, + "learning_rate": 5.610746157436364e-07, + "loss": 1.5275, + "step": 4466 + }, + { + "epoch": 0.3112566630665784, + "grad_norm": 0.6697280425825357, + "learning_rate": 5.610174330914817e-07, + "loss": 1.4777, + "step": 4467 + }, + { + "epoch": 0.31132634219419575, + "grad_norm": 0.7468461994111188, + "learning_rate": 5.609602420042102e-07, + "loss": 1.6182, + "step": 4468 + }, + { + "epoch": 0.31139602132181304, + "grad_norm": 0.7355989195989832, + "learning_rate": 5.609030424845627e-07, + "loss": 1.5292, + "step": 4469 + }, + { + "epoch": 0.3114657004494304, + "grad_norm": 0.7209632348788007, + "learning_rate": 5.608458345352802e-07, + "loss": 1.5888, + "step": 4470 + }, + { + "epoch": 0.3115353795770477, + "grad_norm": 0.6846198121785194, + "learning_rate": 5.607886181591043e-07, + "loss": 1.487, + "step": 4471 + }, + { + "epoch": 0.31160505870466504, + "grad_norm": 0.7210029872091488, + "learning_rate": 5.60731393358777e-07, + "loss": 1.5018, + "step": 4472 + }, + { + "epoch": 0.31167473783228233, + "grad_norm": 0.7261186538094028, + "learning_rate": 5.606741601370406e-07, + "loss": 1.4957, + "step": 4473 + }, + { + "epoch": 0.3117444169598997, + "grad_norm": 0.7004930303444802, + "learning_rate": 5.606169184966377e-07, + "loss": 1.4821, + "step": 4474 + }, + { + "epoch": 0.311814096087517, + "grad_norm": 0.6922215262031016, + "learning_rate": 5.605596684403115e-07, + "loss": 1.4644, + "step": 4475 + }, + { + "epoch": 0.31188377521513433, + "grad_norm": 0.7123564721272889, + "learning_rate": 5.605024099708058e-07, + "loss": 1.4204, + "step": 4476 + }, + { + "epoch": 0.3119534543427516, + "grad_norm": 0.6901484934307104, + "learning_rate": 5.60445143090864e-07, + "loss": 1.4364, + "step": 4477 + }, + { + "epoch": 0.312023133470369, + "grad_norm": 0.7434500197807802, + "learning_rate": 5.60387867803231e-07, + "loss": 1.5128, + "step": 4478 + }, + { + "epoch": 0.31209281259798627, + "grad_norm": 0.6970836184981862, + "learning_rate": 5.603305841106511e-07, + "loss": 1.5619, + "step": 4479 + }, + { + "epoch": 0.3121624917256036, + "grad_norm": 0.7172415376307196, + "learning_rate": 5.6027329201587e-07, + "loss": 1.6662, + "step": 4480 + }, + { + "epoch": 0.3122321708532209, + "grad_norm": 0.724545533459188, + "learning_rate": 5.602159915216326e-07, + "loss": 1.4708, + "step": 4481 + }, + { + "epoch": 0.31230184998083826, + "grad_norm": 0.6768537107167042, + "learning_rate": 5.601586826306853e-07, + "loss": 1.5061, + "step": 4482 + }, + { + "epoch": 0.31237152910845556, + "grad_norm": 0.7007067821148911, + "learning_rate": 5.601013653457743e-07, + "loss": 1.503, + "step": 4483 + }, + { + "epoch": 0.3124412082360729, + "grad_norm": 0.7139890398072082, + "learning_rate": 5.600440396696465e-07, + "loss": 1.5266, + "step": 4484 + }, + { + "epoch": 0.3125108873636902, + "grad_norm": 0.7267770350004812, + "learning_rate": 5.599867056050489e-07, + "loss": 1.4927, + "step": 4485 + }, + { + "epoch": 0.31258056649130755, + "grad_norm": 0.7346092705739481, + "learning_rate": 5.599293631547289e-07, + "loss": 1.5618, + "step": 4486 + }, + { + "epoch": 0.31265024561892485, + "grad_norm": 0.6883420305557534, + "learning_rate": 5.59872012321435e-07, + "loss": 1.4841, + "step": 4487 + }, + { + "epoch": 0.3127199247465422, + "grad_norm": 0.8690928931163032, + "learning_rate": 5.598146531079151e-07, + "loss": 1.6252, + "step": 4488 + }, + { + "epoch": 0.3127896038741595, + "grad_norm": 0.7386897051184834, + "learning_rate": 5.597572855169182e-07, + "loss": 1.5784, + "step": 4489 + }, + { + "epoch": 0.31285928300177684, + "grad_norm": 0.7397508406451234, + "learning_rate": 5.596999095511935e-07, + "loss": 1.5716, + "step": 4490 + }, + { + "epoch": 0.31292896212939414, + "grad_norm": 0.7126692429917938, + "learning_rate": 5.596425252134903e-07, + "loss": 1.5979, + "step": 4491 + }, + { + "epoch": 0.3129986412570115, + "grad_norm": 0.6840682521617562, + "learning_rate": 5.595851325065588e-07, + "loss": 1.6575, + "step": 4492 + }, + { + "epoch": 0.3130683203846288, + "grad_norm": 0.7597470315412264, + "learning_rate": 5.595277314331495e-07, + "loss": 1.5514, + "step": 4493 + }, + { + "epoch": 0.3131379995122461, + "grad_norm": 0.6596496330675966, + "learning_rate": 5.594703219960127e-07, + "loss": 1.4768, + "step": 4494 + }, + { + "epoch": 0.31320767863986343, + "grad_norm": 0.7101073477506992, + "learning_rate": 5.594129041979001e-07, + "loss": 1.4752, + "step": 4495 + }, + { + "epoch": 0.3132773577674807, + "grad_norm": 0.8087865878948145, + "learning_rate": 5.593554780415632e-07, + "loss": 1.5729, + "step": 4496 + }, + { + "epoch": 0.3133470368950981, + "grad_norm": 0.7625992856773187, + "learning_rate": 5.592980435297535e-07, + "loss": 1.5422, + "step": 4497 + }, + { + "epoch": 0.31341671602271537, + "grad_norm": 0.6867096727586379, + "learning_rate": 5.592406006652241e-07, + "loss": 1.4298, + "step": 4498 + }, + { + "epoch": 0.3134863951503327, + "grad_norm": 0.7457698816149559, + "learning_rate": 5.591831494507271e-07, + "loss": 1.5903, + "step": 4499 + }, + { + "epoch": 0.31355607427795, + "grad_norm": 0.7503797775935128, + "learning_rate": 5.591256898890162e-07, + "loss": 1.5962, + "step": 4500 + }, + { + "epoch": 0.31362575340556736, + "grad_norm": 0.7085594299784876, + "learning_rate": 5.590682219828446e-07, + "loss": 1.5023, + "step": 4501 + }, + { + "epoch": 0.31369543253318466, + "grad_norm": 0.7243760005682286, + "learning_rate": 5.590107457349667e-07, + "loss": 1.6518, + "step": 4502 + }, + { + "epoch": 0.313765111660802, + "grad_norm": 0.7448276470045246, + "learning_rate": 5.589532611481363e-07, + "loss": 1.6848, + "step": 4503 + }, + { + "epoch": 0.3138347907884193, + "grad_norm": 0.7494794861319963, + "learning_rate": 5.588957682251087e-07, + "loss": 1.5629, + "step": 4504 + }, + { + "epoch": 0.31390446991603665, + "grad_norm": 0.7213716227937492, + "learning_rate": 5.588382669686389e-07, + "loss": 1.5777, + "step": 4505 + }, + { + "epoch": 0.31397414904365395, + "grad_norm": 0.7646428526365497, + "learning_rate": 5.587807573814824e-07, + "loss": 1.5049, + "step": 4506 + }, + { + "epoch": 0.3140438281712713, + "grad_norm": 0.767120792824794, + "learning_rate": 5.587232394663951e-07, + "loss": 1.4491, + "step": 4507 + }, + { + "epoch": 0.3141135072988886, + "grad_norm": 0.7105591775635199, + "learning_rate": 5.586657132261337e-07, + "loss": 1.4434, + "step": 4508 + }, + { + "epoch": 0.31418318642650594, + "grad_norm": 0.758203714015006, + "learning_rate": 5.586081786634549e-07, + "loss": 1.7566, + "step": 4509 + }, + { + "epoch": 0.31425286555412324, + "grad_norm": 0.7786093682246001, + "learning_rate": 5.585506357811156e-07, + "loss": 1.5759, + "step": 4510 + }, + { + "epoch": 0.3143225446817406, + "grad_norm": 0.6442824769449441, + "learning_rate": 5.584930845818736e-07, + "loss": 1.5383, + "step": 4511 + }, + { + "epoch": 0.3143922238093579, + "grad_norm": 0.6987033992122678, + "learning_rate": 5.584355250684867e-07, + "loss": 1.4782, + "step": 4512 + }, + { + "epoch": 0.31446190293697523, + "grad_norm": 0.7123821044662978, + "learning_rate": 5.583779572437135e-07, + "loss": 1.4858, + "step": 4513 + }, + { + "epoch": 0.31453158206459253, + "grad_norm": 0.7612746697091406, + "learning_rate": 5.583203811103125e-07, + "loss": 1.5794, + "step": 4514 + }, + { + "epoch": 0.3146012611922099, + "grad_norm": 0.6833845167460846, + "learning_rate": 5.582627966710432e-07, + "loss": 1.4549, + "step": 4515 + }, + { + "epoch": 0.3146709403198272, + "grad_norm": 0.7935778077228008, + "learning_rate": 5.582052039286649e-07, + "loss": 1.6172, + "step": 4516 + }, + { + "epoch": 0.3147406194474445, + "grad_norm": 0.7301222093535693, + "learning_rate": 5.581476028859377e-07, + "loss": 1.5544, + "step": 4517 + }, + { + "epoch": 0.3148102985750618, + "grad_norm": 0.7456619945461759, + "learning_rate": 5.580899935456218e-07, + "loss": 1.588, + "step": 4518 + }, + { + "epoch": 0.31487997770267917, + "grad_norm": 0.7179618775554296, + "learning_rate": 5.580323759104781e-07, + "loss": 1.5222, + "step": 4519 + }, + { + "epoch": 0.31494965683029646, + "grad_norm": 0.7132286895714686, + "learning_rate": 5.579747499832679e-07, + "loss": 1.434, + "step": 4520 + }, + { + "epoch": 0.3150193359579138, + "grad_norm": 0.7465885391719073, + "learning_rate": 5.579171157667522e-07, + "loss": 1.5557, + "step": 4521 + }, + { + "epoch": 0.3150890150855311, + "grad_norm": 0.749662806631823, + "learning_rate": 5.578594732636936e-07, + "loss": 1.6624, + "step": 4522 + }, + { + "epoch": 0.31515869421314846, + "grad_norm": 0.949124921903201, + "learning_rate": 5.578018224768542e-07, + "loss": 1.4182, + "step": 4523 + }, + { + "epoch": 0.31522837334076576, + "grad_norm": 0.744353130865854, + "learning_rate": 5.577441634089965e-07, + "loss": 1.6209, + "step": 4524 + }, + { + "epoch": 0.3152980524683831, + "grad_norm": 0.6873792099313955, + "learning_rate": 5.576864960628839e-07, + "loss": 1.4797, + "step": 4525 + }, + { + "epoch": 0.3153677315960004, + "grad_norm": 0.6994386653153584, + "learning_rate": 5.5762882044128e-07, + "loss": 1.4613, + "step": 4526 + }, + { + "epoch": 0.31543741072361775, + "grad_norm": 0.705522154107607, + "learning_rate": 5.575711365469486e-07, + "loss": 1.6182, + "step": 4527 + }, + { + "epoch": 0.31550708985123505, + "grad_norm": 0.7725865357784353, + "learning_rate": 5.57513444382654e-07, + "loss": 1.569, + "step": 4528 + }, + { + "epoch": 0.3155767689788524, + "grad_norm": 0.6845243748298593, + "learning_rate": 5.574557439511612e-07, + "loss": 1.5217, + "step": 4529 + }, + { + "epoch": 0.3156464481064697, + "grad_norm": 0.7261032424467299, + "learning_rate": 5.573980352552348e-07, + "loss": 1.6041, + "step": 4530 + }, + { + "epoch": 0.31571612723408704, + "grad_norm": 0.773279494758788, + "learning_rate": 5.573403182976408e-07, + "loss": 1.6184, + "step": 4531 + }, + { + "epoch": 0.31578580636170434, + "grad_norm": 0.9622361724689067, + "learning_rate": 5.572825930811449e-07, + "loss": 1.6083, + "step": 4532 + }, + { + "epoch": 0.3158554854893217, + "grad_norm": 0.7432584934801373, + "learning_rate": 5.572248596085133e-07, + "loss": 1.436, + "step": 4533 + }, + { + "epoch": 0.315925164616939, + "grad_norm": 0.7427295237224485, + "learning_rate": 5.571671178825131e-07, + "loss": 1.5585, + "step": 4534 + }, + { + "epoch": 0.31599484374455633, + "grad_norm": 0.6928269582168132, + "learning_rate": 5.57109367905911e-07, + "loss": 1.4105, + "step": 4535 + }, + { + "epoch": 0.3160645228721736, + "grad_norm": 0.6728242767255252, + "learning_rate": 5.570516096814747e-07, + "loss": 1.5522, + "step": 4536 + }, + { + "epoch": 0.316134201999791, + "grad_norm": 0.7263367317524648, + "learning_rate": 5.569938432119721e-07, + "loss": 1.6393, + "step": 4537 + }, + { + "epoch": 0.31620388112740827, + "grad_norm": 0.715112801174288, + "learning_rate": 5.569360685001715e-07, + "loss": 1.4807, + "step": 4538 + }, + { + "epoch": 0.3162735602550256, + "grad_norm": 0.7492932996364073, + "learning_rate": 5.568782855488413e-07, + "loss": 1.6866, + "step": 4539 + }, + { + "epoch": 0.3163432393826429, + "grad_norm": 0.6986507984234215, + "learning_rate": 5.568204943607508e-07, + "loss": 1.5787, + "step": 4540 + }, + { + "epoch": 0.31641291851026027, + "grad_norm": 0.6888654643145271, + "learning_rate": 5.567626949386696e-07, + "loss": 1.6045, + "step": 4541 + }, + { + "epoch": 0.31648259763787756, + "grad_norm": 0.6805162691732184, + "learning_rate": 5.567048872853675e-07, + "loss": 1.5848, + "step": 4542 + }, + { + "epoch": 0.3165522767654949, + "grad_norm": 0.6884368305487023, + "learning_rate": 5.566470714036145e-07, + "loss": 1.55, + "step": 4543 + }, + { + "epoch": 0.3166219558931122, + "grad_norm": 0.7570329409020771, + "learning_rate": 5.565892472961816e-07, + "loss": 1.6468, + "step": 4544 + }, + { + "epoch": 0.31669163502072956, + "grad_norm": 0.7059467717662286, + "learning_rate": 5.565314149658398e-07, + "loss": 1.4445, + "step": 4545 + }, + { + "epoch": 0.31676131414834685, + "grad_norm": 0.6657695365455025, + "learning_rate": 5.564735744153601e-07, + "loss": 1.4957, + "step": 4546 + }, + { + "epoch": 0.3168309932759642, + "grad_norm": 0.7407102775583433, + "learning_rate": 5.56415725647515e-07, + "loss": 1.5459, + "step": 4547 + }, + { + "epoch": 0.3169006724035815, + "grad_norm": 0.7106313167914474, + "learning_rate": 5.563578686650763e-07, + "loss": 1.5298, + "step": 4548 + }, + { + "epoch": 0.31697035153119885, + "grad_norm": 0.682333055169887, + "learning_rate": 5.563000034708168e-07, + "loss": 1.5829, + "step": 4549 + }, + { + "epoch": 0.31704003065881614, + "grad_norm": 0.7192818302564468, + "learning_rate": 5.562421300675094e-07, + "loss": 1.419, + "step": 4550 + }, + { + "epoch": 0.3171097097864335, + "grad_norm": 0.7430656876330435, + "learning_rate": 5.561842484579276e-07, + "loss": 1.5114, + "step": 4551 + }, + { + "epoch": 0.3171793889140508, + "grad_norm": 0.7111699564860365, + "learning_rate": 5.561263586448452e-07, + "loss": 1.6344, + "step": 4552 + }, + { + "epoch": 0.31724906804166814, + "grad_norm": 0.7041991520578657, + "learning_rate": 5.560684606310363e-07, + "loss": 1.4087, + "step": 4553 + }, + { + "epoch": 0.31731874716928543, + "grad_norm": 0.6955203999717197, + "learning_rate": 5.560105544192756e-07, + "loss": 1.5001, + "step": 4554 + }, + { + "epoch": 0.3173884262969028, + "grad_norm": 0.642220065364471, + "learning_rate": 5.559526400123382e-07, + "loss": 1.3332, + "step": 4555 + }, + { + "epoch": 0.3174581054245201, + "grad_norm": 0.7146233847081206, + "learning_rate": 5.558947174129991e-07, + "loss": 1.5475, + "step": 4556 + }, + { + "epoch": 0.3175277845521374, + "grad_norm": 0.6547314881825679, + "learning_rate": 5.558367866240346e-07, + "loss": 1.451, + "step": 4557 + }, + { + "epoch": 0.3175974636797547, + "grad_norm": 0.7557513478775116, + "learning_rate": 5.557788476482202e-07, + "loss": 1.512, + "step": 4558 + }, + { + "epoch": 0.31766714280737207, + "grad_norm": 0.7504684331261248, + "learning_rate": 5.557209004883331e-07, + "loss": 1.4838, + "step": 4559 + }, + { + "epoch": 0.31773682193498937, + "grad_norm": 0.6641025593399148, + "learning_rate": 5.556629451471498e-07, + "loss": 1.4582, + "step": 4560 + }, + { + "epoch": 0.3178065010626067, + "grad_norm": 0.6858127362399964, + "learning_rate": 5.556049816274479e-07, + "loss": 1.4477, + "step": 4561 + }, + { + "epoch": 0.317876180190224, + "grad_norm": 0.7816268642559431, + "learning_rate": 5.555470099320049e-07, + "loss": 1.4744, + "step": 4562 + }, + { + "epoch": 0.31794585931784136, + "grad_norm": 0.7280784746335369, + "learning_rate": 5.554890300635992e-07, + "loss": 1.602, + "step": 4563 + }, + { + "epoch": 0.31801553844545866, + "grad_norm": 0.7235496817025429, + "learning_rate": 5.554310420250091e-07, + "loss": 1.5873, + "step": 4564 + }, + { + "epoch": 0.318085217573076, + "grad_norm": 0.7145943660212166, + "learning_rate": 5.553730458190136e-07, + "loss": 1.5606, + "step": 4565 + }, + { + "epoch": 0.3181548967006933, + "grad_norm": 0.7405716180465298, + "learning_rate": 5.55315041448392e-07, + "loss": 1.5548, + "step": 4566 + }, + { + "epoch": 0.31822457582831065, + "grad_norm": 0.699821597252651, + "learning_rate": 5.55257028915924e-07, + "loss": 1.5007, + "step": 4567 + }, + { + "epoch": 0.31829425495592795, + "grad_norm": 0.7264404039646922, + "learning_rate": 5.551990082243896e-07, + "loss": 1.5778, + "step": 4568 + }, + { + "epoch": 0.3183639340835453, + "grad_norm": 0.7018136293753561, + "learning_rate": 5.551409793765692e-07, + "loss": 1.5822, + "step": 4569 + }, + { + "epoch": 0.3184336132111626, + "grad_norm": 0.7085191412090573, + "learning_rate": 5.55082942375244e-07, + "loss": 1.566, + "step": 4570 + }, + { + "epoch": 0.31850329233877994, + "grad_norm": 0.6959965920230744, + "learning_rate": 5.550248972231949e-07, + "loss": 1.424, + "step": 4571 + }, + { + "epoch": 0.31857297146639724, + "grad_norm": 0.7573753306961281, + "learning_rate": 5.549668439232036e-07, + "loss": 1.7044, + "step": 4572 + }, + { + "epoch": 0.3186426505940146, + "grad_norm": 0.7796455158742204, + "learning_rate": 5.549087824780523e-07, + "loss": 1.4707, + "step": 4573 + }, + { + "epoch": 0.3187123297216319, + "grad_norm": 0.8073763532584914, + "learning_rate": 5.548507128905233e-07, + "loss": 1.6353, + "step": 4574 + }, + { + "epoch": 0.31878200884924923, + "grad_norm": 0.6759903199641083, + "learning_rate": 5.547926351633995e-07, + "loss": 1.4381, + "step": 4575 + }, + { + "epoch": 0.3188516879768665, + "grad_norm": 0.8112029098972438, + "learning_rate": 5.54734549299464e-07, + "loss": 1.4541, + "step": 4576 + }, + { + "epoch": 0.3189213671044839, + "grad_norm": 0.668927646315414, + "learning_rate": 5.546764553015004e-07, + "loss": 1.5504, + "step": 4577 + }, + { + "epoch": 0.31899104623210117, + "grad_norm": 0.728448386639013, + "learning_rate": 5.546183531722927e-07, + "loss": 1.6189, + "step": 4578 + }, + { + "epoch": 0.3190607253597185, + "grad_norm": 0.7022708932585651, + "learning_rate": 5.545602429146254e-07, + "loss": 1.4315, + "step": 4579 + }, + { + "epoch": 0.3191304044873358, + "grad_norm": 0.7663878863520591, + "learning_rate": 5.54502124531283e-07, + "loss": 1.5433, + "step": 4580 + }, + { + "epoch": 0.31920008361495317, + "grad_norm": 0.7464042904379575, + "learning_rate": 5.544439980250511e-07, + "loss": 1.7276, + "step": 4581 + }, + { + "epoch": 0.31926976274257046, + "grad_norm": 0.761958715868217, + "learning_rate": 5.543858633987147e-07, + "loss": 1.4849, + "step": 4582 + }, + { + "epoch": 0.3193394418701878, + "grad_norm": 0.7402266957550955, + "learning_rate": 5.5432772065506e-07, + "loss": 1.619, + "step": 4583 + }, + { + "epoch": 0.3194091209978051, + "grad_norm": 0.676648377109348, + "learning_rate": 5.542695697968735e-07, + "loss": 1.5384, + "step": 4584 + }, + { + "epoch": 0.31947880012542246, + "grad_norm": 0.7194002923819981, + "learning_rate": 5.542114108269416e-07, + "loss": 1.499, + "step": 4585 + }, + { + "epoch": 0.31954847925303975, + "grad_norm": 0.7094758253955392, + "learning_rate": 5.541532437480514e-07, + "loss": 1.5803, + "step": 4586 + }, + { + "epoch": 0.31961815838065705, + "grad_norm": 0.6828922434293256, + "learning_rate": 5.540950685629905e-07, + "loss": 1.4883, + "step": 4587 + }, + { + "epoch": 0.3196878375082744, + "grad_norm": 0.7289414195466307, + "learning_rate": 5.540368852745469e-07, + "loss": 1.6278, + "step": 4588 + }, + { + "epoch": 0.3197575166358917, + "grad_norm": 0.7408149230732806, + "learning_rate": 5.539786938855087e-07, + "loss": 1.5954, + "step": 4589 + }, + { + "epoch": 0.31982719576350904, + "grad_norm": 0.6675256658590901, + "learning_rate": 5.539204943986645e-07, + "loss": 1.517, + "step": 4590 + }, + { + "epoch": 0.31989687489112634, + "grad_norm": 0.6800644937580433, + "learning_rate": 5.538622868168034e-07, + "loss": 1.5081, + "step": 4591 + }, + { + "epoch": 0.3199665540187437, + "grad_norm": 0.729421915525262, + "learning_rate": 5.53804071142715e-07, + "loss": 1.609, + "step": 4592 + }, + { + "epoch": 0.320036233146361, + "grad_norm": 0.6810241293051824, + "learning_rate": 5.537458473791889e-07, + "loss": 1.4824, + "step": 4593 + }, + { + "epoch": 0.32010591227397833, + "grad_norm": 0.6577963086195658, + "learning_rate": 5.536876155290153e-07, + "loss": 1.4177, + "step": 4594 + }, + { + "epoch": 0.3201755914015956, + "grad_norm": 0.7461821891392836, + "learning_rate": 5.53629375594985e-07, + "loss": 1.5158, + "step": 4595 + }, + { + "epoch": 0.320245270529213, + "grad_norm": 0.7391443519888712, + "learning_rate": 5.535711275798887e-07, + "loss": 1.4028, + "step": 4596 + }, + { + "epoch": 0.3203149496568303, + "grad_norm": 0.7217676239781258, + "learning_rate": 5.53512871486518e-07, + "loss": 1.6136, + "step": 4597 + }, + { + "epoch": 0.3203846287844476, + "grad_norm": 0.7728777338722576, + "learning_rate": 5.534546073176645e-07, + "loss": 1.4466, + "step": 4598 + }, + { + "epoch": 0.3204543079120649, + "grad_norm": 0.6624023550793688, + "learning_rate": 5.533963350761203e-07, + "loss": 1.5125, + "step": 4599 + }, + { + "epoch": 0.32052398703968227, + "grad_norm": 0.7744005505294767, + "learning_rate": 5.533380547646781e-07, + "loss": 1.4948, + "step": 4600 + }, + { + "epoch": 0.32059366616729956, + "grad_norm": 0.6691477498617161, + "learning_rate": 5.532797663861307e-07, + "loss": 1.5916, + "step": 4601 + }, + { + "epoch": 0.3206633452949169, + "grad_norm": 0.6918371725583984, + "learning_rate": 5.532214699432715e-07, + "loss": 1.5005, + "step": 4602 + }, + { + "epoch": 0.3207330244225342, + "grad_norm": 0.7497768594055559, + "learning_rate": 5.53163165438894e-07, + "loss": 1.4548, + "step": 4603 + }, + { + "epoch": 0.32080270355015156, + "grad_norm": 0.7488556365884924, + "learning_rate": 5.531048528757924e-07, + "loss": 1.5472, + "step": 4604 + }, + { + "epoch": 0.32087238267776885, + "grad_norm": 0.7579239463318133, + "learning_rate": 5.530465322567612e-07, + "loss": 1.5825, + "step": 4605 + }, + { + "epoch": 0.3209420618053862, + "grad_norm": 0.7919011629759329, + "learning_rate": 5.529882035845952e-07, + "loss": 1.5038, + "step": 4606 + }, + { + "epoch": 0.3210117409330035, + "grad_norm": 0.7674975520767843, + "learning_rate": 5.529298668620894e-07, + "loss": 1.6024, + "step": 4607 + }, + { + "epoch": 0.32108142006062085, + "grad_norm": 0.7096403207271622, + "learning_rate": 5.528715220920397e-07, + "loss": 1.5253, + "step": 4608 + }, + { + "epoch": 0.32115109918823814, + "grad_norm": 0.7134317507024582, + "learning_rate": 5.528131692772423e-07, + "loss": 1.6089, + "step": 4609 + }, + { + "epoch": 0.3212207783158555, + "grad_norm": 0.7671723571840533, + "learning_rate": 5.52754808420493e-07, + "loss": 1.6016, + "step": 4610 + }, + { + "epoch": 0.3212904574434728, + "grad_norm": 0.688495820899462, + "learning_rate": 5.52696439524589e-07, + "loss": 1.5726, + "step": 4611 + }, + { + "epoch": 0.32136013657109014, + "grad_norm": 0.6678380593790649, + "learning_rate": 5.526380625923274e-07, + "loss": 1.5112, + "step": 4612 + }, + { + "epoch": 0.32142981569870743, + "grad_norm": 0.745762956582063, + "learning_rate": 5.525796776265057e-07, + "loss": 1.5177, + "step": 4613 + }, + { + "epoch": 0.3214994948263248, + "grad_norm": 0.7112486718285385, + "learning_rate": 5.525212846299217e-07, + "loss": 1.5147, + "step": 4614 + }, + { + "epoch": 0.3215691739539421, + "grad_norm": 0.724836975919717, + "learning_rate": 5.524628836053739e-07, + "loss": 1.6959, + "step": 4615 + }, + { + "epoch": 0.32163885308155943, + "grad_norm": 0.6913389692644148, + "learning_rate": 5.524044745556608e-07, + "loss": 1.5525, + "step": 4616 + }, + { + "epoch": 0.3217085322091767, + "grad_norm": 0.7223242124116028, + "learning_rate": 5.523460574835818e-07, + "loss": 1.6182, + "step": 4617 + }, + { + "epoch": 0.3217782113367941, + "grad_norm": 0.6972269063425547, + "learning_rate": 5.52287632391936e-07, + "loss": 1.614, + "step": 4618 + }, + { + "epoch": 0.32184789046441137, + "grad_norm": 0.6855602585457429, + "learning_rate": 5.522291992835234e-07, + "loss": 1.5322, + "step": 4619 + }, + { + "epoch": 0.3219175695920287, + "grad_norm": 0.7295488072661531, + "learning_rate": 5.521707581611445e-07, + "loss": 1.4599, + "step": 4620 + }, + { + "epoch": 0.321987248719646, + "grad_norm": 0.7602169389635018, + "learning_rate": 5.521123090275996e-07, + "loss": 1.4392, + "step": 4621 + }, + { + "epoch": 0.32205692784726336, + "grad_norm": 0.7611538203907603, + "learning_rate": 5.520538518856896e-07, + "loss": 1.6624, + "step": 4622 + }, + { + "epoch": 0.32212660697488066, + "grad_norm": 0.7382468747722052, + "learning_rate": 5.519953867382163e-07, + "loss": 1.5633, + "step": 4623 + }, + { + "epoch": 0.322196286102498, + "grad_norm": 0.6877266006916888, + "learning_rate": 5.51936913587981e-07, + "loss": 1.4824, + "step": 4624 + }, + { + "epoch": 0.3222659652301153, + "grad_norm": 0.7238249354166105, + "learning_rate": 5.518784324377861e-07, + "loss": 1.6241, + "step": 4625 + }, + { + "epoch": 0.32233564435773265, + "grad_norm": 0.7876389933251687, + "learning_rate": 5.518199432904342e-07, + "loss": 1.6126, + "step": 4626 + }, + { + "epoch": 0.32240532348534995, + "grad_norm": 0.7244708081210876, + "learning_rate": 5.517614461487283e-07, + "loss": 1.57, + "step": 4627 + }, + { + "epoch": 0.3224750026129673, + "grad_norm": 0.6877563899579893, + "learning_rate": 5.517029410154713e-07, + "loss": 1.4658, + "step": 4628 + }, + { + "epoch": 0.3225446817405846, + "grad_norm": 0.7381068738510921, + "learning_rate": 5.516444278934672e-07, + "loss": 1.472, + "step": 4629 + }, + { + "epoch": 0.32261436086820194, + "grad_norm": 0.7661324152373148, + "learning_rate": 5.5158590678552e-07, + "loss": 1.4738, + "step": 4630 + }, + { + "epoch": 0.32268403999581924, + "grad_norm": 0.7180663750359986, + "learning_rate": 5.515273776944343e-07, + "loss": 1.4249, + "step": 4631 + }, + { + "epoch": 0.3227537191234366, + "grad_norm": 0.7314590378305785, + "learning_rate": 5.514688406230145e-07, + "loss": 1.5152, + "step": 4632 + }, + { + "epoch": 0.3228233982510539, + "grad_norm": 0.6801266282448725, + "learning_rate": 5.514102955740663e-07, + "loss": 1.5268, + "step": 4633 + }, + { + "epoch": 0.32289307737867123, + "grad_norm": 0.7305814534042752, + "learning_rate": 5.51351742550395e-07, + "loss": 1.495, + "step": 4634 + }, + { + "epoch": 0.32296275650628853, + "grad_norm": 0.6815117848190505, + "learning_rate": 5.512931815548069e-07, + "loss": 1.4861, + "step": 4635 + }, + { + "epoch": 0.3230324356339059, + "grad_norm": 0.7455133929413991, + "learning_rate": 5.512346125901079e-07, + "loss": 1.4462, + "step": 4636 + }, + { + "epoch": 0.3231021147615232, + "grad_norm": 0.7508120732279528, + "learning_rate": 5.511760356591052e-07, + "loss": 1.5483, + "step": 4637 + }, + { + "epoch": 0.3231717938891405, + "grad_norm": 0.7014812959074661, + "learning_rate": 5.511174507646055e-07, + "loss": 1.609, + "step": 4638 + }, + { + "epoch": 0.3232414730167578, + "grad_norm": 0.7421777110315984, + "learning_rate": 5.510588579094168e-07, + "loss": 1.4491, + "step": 4639 + }, + { + "epoch": 0.32331115214437517, + "grad_norm": 0.7827639342952755, + "learning_rate": 5.510002570963465e-07, + "loss": 1.4477, + "step": 4640 + }, + { + "epoch": 0.32338083127199246, + "grad_norm": 0.6862468086728646, + "learning_rate": 5.50941648328203e-07, + "loss": 1.5549, + "step": 4641 + }, + { + "epoch": 0.3234505103996098, + "grad_norm": 0.7260160638503516, + "learning_rate": 5.508830316077952e-07, + "loss": 1.4935, + "step": 4642 + }, + { + "epoch": 0.3235201895272271, + "grad_norm": 0.6932870393561489, + "learning_rate": 5.508244069379321e-07, + "loss": 1.7095, + "step": 4643 + }, + { + "epoch": 0.32358986865484446, + "grad_norm": 0.7465469881600842, + "learning_rate": 5.507657743214228e-07, + "loss": 1.5113, + "step": 4644 + }, + { + "epoch": 0.32365954778246175, + "grad_norm": 0.724992133322777, + "learning_rate": 5.507071337610773e-07, + "loss": 1.6181, + "step": 4645 + }, + { + "epoch": 0.3237292269100791, + "grad_norm": 0.746052557847551, + "learning_rate": 5.506484852597058e-07, + "loss": 1.6747, + "step": 4646 + }, + { + "epoch": 0.3237989060376964, + "grad_norm": 0.7450971054334902, + "learning_rate": 5.505898288201188e-07, + "loss": 1.6415, + "step": 4647 + }, + { + "epoch": 0.32386858516531375, + "grad_norm": 0.7264260556915378, + "learning_rate": 5.505311644451272e-07, + "loss": 1.6594, + "step": 4648 + }, + { + "epoch": 0.32393826429293104, + "grad_norm": 0.7155597727480145, + "learning_rate": 5.504724921375425e-07, + "loss": 1.4577, + "step": 4649 + }, + { + "epoch": 0.3240079434205484, + "grad_norm": 0.7669193360173628, + "learning_rate": 5.504138119001761e-07, + "loss": 1.5443, + "step": 4650 + }, + { + "epoch": 0.3240776225481657, + "grad_norm": 0.8326790168924357, + "learning_rate": 5.503551237358404e-07, + "loss": 1.7411, + "step": 4651 + }, + { + "epoch": 0.32414730167578304, + "grad_norm": 0.7134212535930974, + "learning_rate": 5.502964276473477e-07, + "loss": 1.5207, + "step": 4652 + }, + { + "epoch": 0.32421698080340033, + "grad_norm": 0.7164271081334321, + "learning_rate": 5.502377236375108e-07, + "loss": 1.563, + "step": 4653 + }, + { + "epoch": 0.3242866599310177, + "grad_norm": 0.7574513017908499, + "learning_rate": 5.501790117091429e-07, + "loss": 1.5626, + "step": 4654 + }, + { + "epoch": 0.324356339058635, + "grad_norm": 0.6950388343278109, + "learning_rate": 5.501202918650577e-07, + "loss": 1.5222, + "step": 4655 + }, + { + "epoch": 0.32442601818625233, + "grad_norm": 0.7125678417544166, + "learning_rate": 5.500615641080691e-07, + "loss": 1.6304, + "step": 4656 + }, + { + "epoch": 0.3244956973138696, + "grad_norm": 0.6998624739958945, + "learning_rate": 5.500028284409915e-07, + "loss": 1.5324, + "step": 4657 + }, + { + "epoch": 0.324565376441487, + "grad_norm": 0.7068349491161948, + "learning_rate": 5.499440848666395e-07, + "loss": 1.5863, + "step": 4658 + }, + { + "epoch": 0.32463505556910427, + "grad_norm": 0.661201049428343, + "learning_rate": 5.498853333878285e-07, + "loss": 1.4551, + "step": 4659 + }, + { + "epoch": 0.3247047346967216, + "grad_norm": 0.7269218043387352, + "learning_rate": 5.498265740073738e-07, + "loss": 1.5651, + "step": 4660 + }, + { + "epoch": 0.3247744138243389, + "grad_norm": 0.749285337503175, + "learning_rate": 5.497678067280913e-07, + "loss": 1.7859, + "step": 4661 + }, + { + "epoch": 0.32484409295195626, + "grad_norm": 0.7480084626513137, + "learning_rate": 5.497090315527971e-07, + "loss": 1.5376, + "step": 4662 + }, + { + "epoch": 0.32491377207957356, + "grad_norm": 0.6844373648201154, + "learning_rate": 5.496502484843082e-07, + "loss": 1.4983, + "step": 4663 + }, + { + "epoch": 0.3249834512071909, + "grad_norm": 0.8226873965644259, + "learning_rate": 5.495914575254411e-07, + "loss": 1.7003, + "step": 4664 + }, + { + "epoch": 0.3250531303348082, + "grad_norm": 0.6728335863273786, + "learning_rate": 5.495326586790137e-07, + "loss": 1.5181, + "step": 4665 + }, + { + "epoch": 0.32512280946242555, + "grad_norm": 0.7469348771533575, + "learning_rate": 5.494738519478434e-07, + "loss": 1.6188, + "step": 4666 + }, + { + "epoch": 0.32519248859004285, + "grad_norm": 0.7149477160520245, + "learning_rate": 5.494150373347485e-07, + "loss": 1.5103, + "step": 4667 + }, + { + "epoch": 0.3252621677176602, + "grad_norm": 0.7543294043052631, + "learning_rate": 5.493562148425475e-07, + "loss": 1.531, + "step": 4668 + }, + { + "epoch": 0.3253318468452775, + "grad_norm": 0.8562125075394144, + "learning_rate": 5.492973844740592e-07, + "loss": 1.5215, + "step": 4669 + }, + { + "epoch": 0.32540152597289485, + "grad_norm": 0.7576430297954465, + "learning_rate": 5.492385462321028e-07, + "loss": 1.4913, + "step": 4670 + }, + { + "epoch": 0.32547120510051214, + "grad_norm": 0.693823950005331, + "learning_rate": 5.491797001194984e-07, + "loss": 1.5224, + "step": 4671 + }, + { + "epoch": 0.3255408842281295, + "grad_norm": 0.7327177721134271, + "learning_rate": 5.491208461390654e-07, + "loss": 1.4314, + "step": 4672 + }, + { + "epoch": 0.3256105633557468, + "grad_norm": 0.7566433006080883, + "learning_rate": 5.490619842936248e-07, + "loss": 1.6511, + "step": 4673 + }, + { + "epoch": 0.32568024248336414, + "grad_norm": 0.7885667972779885, + "learning_rate": 5.490031145859969e-07, + "loss": 1.7014, + "step": 4674 + }, + { + "epoch": 0.32574992161098143, + "grad_norm": 0.7768801147957153, + "learning_rate": 5.489442370190032e-07, + "loss": 1.5669, + "step": 4675 + }, + { + "epoch": 0.3258196007385988, + "grad_norm": 0.720878744425241, + "learning_rate": 5.488853515954651e-07, + "loss": 1.5863, + "step": 4676 + }, + { + "epoch": 0.3258892798662161, + "grad_norm": 0.6862498975781554, + "learning_rate": 5.488264583182043e-07, + "loss": 1.6059, + "step": 4677 + }, + { + "epoch": 0.32595895899383337, + "grad_norm": 0.7409461109224728, + "learning_rate": 5.487675571900435e-07, + "loss": 1.5253, + "step": 4678 + }, + { + "epoch": 0.3260286381214507, + "grad_norm": 0.7660810857821033, + "learning_rate": 5.48708648213805e-07, + "loss": 1.5677, + "step": 4679 + }, + { + "epoch": 0.326098317249068, + "grad_norm": 0.709707457530693, + "learning_rate": 5.486497313923121e-07, + "loss": 1.4906, + "step": 4680 + }, + { + "epoch": 0.32616799637668537, + "grad_norm": 0.7039157582721269, + "learning_rate": 5.48590806728388e-07, + "loss": 1.642, + "step": 4681 + }, + { + "epoch": 0.32623767550430266, + "grad_norm": 0.8055219249645671, + "learning_rate": 5.485318742248567e-07, + "loss": 1.5276, + "step": 4682 + }, + { + "epoch": 0.32630735463192, + "grad_norm": 0.7082044224744788, + "learning_rate": 5.484729338845422e-07, + "loss": 1.5328, + "step": 4683 + }, + { + "epoch": 0.3263770337595373, + "grad_norm": 0.7195194869776044, + "learning_rate": 5.484139857102691e-07, + "loss": 1.564, + "step": 4684 + }, + { + "epoch": 0.32644671288715466, + "grad_norm": 0.7260360051203139, + "learning_rate": 5.483550297048624e-07, + "loss": 1.5654, + "step": 4685 + }, + { + "epoch": 0.32651639201477195, + "grad_norm": 0.7095863416019655, + "learning_rate": 5.482960658711472e-07, + "loss": 1.6385, + "step": 4686 + }, + { + "epoch": 0.3265860711423893, + "grad_norm": 0.7089680351775876, + "learning_rate": 5.482370942119494e-07, + "loss": 1.5918, + "step": 4687 + }, + { + "epoch": 0.3266557502700066, + "grad_norm": 0.7754253458635536, + "learning_rate": 5.481781147300948e-07, + "loss": 1.5457, + "step": 4688 + }, + { + "epoch": 0.32672542939762395, + "grad_norm": 0.7133933783068345, + "learning_rate": 5.481191274284101e-07, + "loss": 1.4844, + "step": 4689 + }, + { + "epoch": 0.32679510852524124, + "grad_norm": 0.7290333114124878, + "learning_rate": 5.480601323097218e-07, + "loss": 1.6388, + "step": 4690 + }, + { + "epoch": 0.3268647876528586, + "grad_norm": 0.7342510780107464, + "learning_rate": 5.480011293768572e-07, + "loss": 1.4667, + "step": 4691 + }, + { + "epoch": 0.3269344667804759, + "grad_norm": 0.691124208728813, + "learning_rate": 5.479421186326439e-07, + "loss": 1.4353, + "step": 4692 + }, + { + "epoch": 0.32700414590809324, + "grad_norm": 0.7409084935108537, + "learning_rate": 5.478831000799098e-07, + "loss": 1.4188, + "step": 4693 + }, + { + "epoch": 0.32707382503571053, + "grad_norm": 0.7581250592188127, + "learning_rate": 5.478240737214831e-07, + "loss": 1.4854, + "step": 4694 + }, + { + "epoch": 0.3271435041633279, + "grad_norm": 0.6916383712026993, + "learning_rate": 5.477650395601926e-07, + "loss": 1.6155, + "step": 4695 + }, + { + "epoch": 0.3272131832909452, + "grad_norm": 0.7525082735725286, + "learning_rate": 5.477059975988671e-07, + "loss": 1.6562, + "step": 4696 + }, + { + "epoch": 0.3272828624185625, + "grad_norm": 0.6850325687097222, + "learning_rate": 5.476469478403363e-07, + "loss": 1.5095, + "step": 4697 + }, + { + "epoch": 0.3273525415461798, + "grad_norm": 0.6911370545439388, + "learning_rate": 5.475878902874298e-07, + "loss": 1.5014, + "step": 4698 + }, + { + "epoch": 0.32742222067379717, + "grad_norm": 0.6779251995747619, + "learning_rate": 5.475288249429777e-07, + "loss": 1.521, + "step": 4699 + }, + { + "epoch": 0.32749189980141447, + "grad_norm": 0.6777351655241748, + "learning_rate": 5.474697518098108e-07, + "loss": 1.5156, + "step": 4700 + }, + { + "epoch": 0.3275615789290318, + "grad_norm": 0.6809433325320242, + "learning_rate": 5.4741067089076e-07, + "loss": 1.5013, + "step": 4701 + }, + { + "epoch": 0.3276312580566491, + "grad_norm": 0.7119144281537158, + "learning_rate": 5.47351582188656e-07, + "loss": 1.5415, + "step": 4702 + }, + { + "epoch": 0.32770093718426646, + "grad_norm": 0.6613717045837818, + "learning_rate": 5.472924857063311e-07, + "loss": 1.4926, + "step": 4703 + }, + { + "epoch": 0.32777061631188376, + "grad_norm": 0.7288042034986972, + "learning_rate": 5.472333814466173e-07, + "loss": 1.4969, + "step": 4704 + }, + { + "epoch": 0.3278402954395011, + "grad_norm": 0.6995801218556058, + "learning_rate": 5.471742694123465e-07, + "loss": 1.4725, + "step": 4705 + }, + { + "epoch": 0.3279099745671184, + "grad_norm": 0.6891604297815477, + "learning_rate": 5.471151496063519e-07, + "loss": 1.4852, + "step": 4706 + }, + { + "epoch": 0.32797965369473575, + "grad_norm": 0.7077304381998237, + "learning_rate": 5.470560220314666e-07, + "loss": 1.5236, + "step": 4707 + }, + { + "epoch": 0.32804933282235305, + "grad_norm": 0.6457256925404681, + "learning_rate": 5.469968866905239e-07, + "loss": 1.5301, + "step": 4708 + }, + { + "epoch": 0.3281190119499704, + "grad_norm": 0.7383356695325461, + "learning_rate": 5.469377435863577e-07, + "loss": 1.5252, + "step": 4709 + }, + { + "epoch": 0.3281886910775877, + "grad_norm": 0.7226731345944863, + "learning_rate": 5.468785927218026e-07, + "loss": 1.5984, + "step": 4710 + }, + { + "epoch": 0.32825837020520504, + "grad_norm": 0.7351157644434121, + "learning_rate": 5.468194340996929e-07, + "loss": 1.662, + "step": 4711 + }, + { + "epoch": 0.32832804933282234, + "grad_norm": 0.6957818071174824, + "learning_rate": 5.467602677228638e-07, + "loss": 1.4365, + "step": 4712 + }, + { + "epoch": 0.3283977284604397, + "grad_norm": 0.7196864910738779, + "learning_rate": 5.467010935941507e-07, + "loss": 1.5894, + "step": 4713 + }, + { + "epoch": 0.328467407588057, + "grad_norm": 0.7146083912133652, + "learning_rate": 5.466419117163889e-07, + "loss": 1.4484, + "step": 4714 + }, + { + "epoch": 0.32853708671567433, + "grad_norm": 0.8106128842589259, + "learning_rate": 5.465827220924151e-07, + "loss": 1.4741, + "step": 4715 + }, + { + "epoch": 0.3286067658432916, + "grad_norm": 0.7103171764326403, + "learning_rate": 5.465235247250653e-07, + "loss": 1.508, + "step": 4716 + }, + { + "epoch": 0.328676444970909, + "grad_norm": 0.6812060402187873, + "learning_rate": 5.464643196171767e-07, + "loss": 1.4331, + "step": 4717 + }, + { + "epoch": 0.32874612409852627, + "grad_norm": 0.7177732946700219, + "learning_rate": 5.464051067715865e-07, + "loss": 1.4702, + "step": 4718 + }, + { + "epoch": 0.3288158032261436, + "grad_norm": 0.6958049083294975, + "learning_rate": 5.463458861911322e-07, + "loss": 1.4755, + "step": 4719 + }, + { + "epoch": 0.3288854823537609, + "grad_norm": 0.7325285859162157, + "learning_rate": 5.462866578786518e-07, + "loss": 1.4601, + "step": 4720 + }, + { + "epoch": 0.32895516148137827, + "grad_norm": 0.7582230823199709, + "learning_rate": 5.462274218369836e-07, + "loss": 1.4437, + "step": 4721 + }, + { + "epoch": 0.32902484060899556, + "grad_norm": 0.7210199222494685, + "learning_rate": 5.461681780689663e-07, + "loss": 1.5113, + "step": 4722 + }, + { + "epoch": 0.3290945197366129, + "grad_norm": 0.6795180257214272, + "learning_rate": 5.461089265774391e-07, + "loss": 1.4581, + "step": 4723 + }, + { + "epoch": 0.3291641988642302, + "grad_norm": 0.7355181900180564, + "learning_rate": 5.460496673652414e-07, + "loss": 1.5793, + "step": 4724 + }, + { + "epoch": 0.32923387799184756, + "grad_norm": 0.7418505821521018, + "learning_rate": 5.45990400435213e-07, + "loss": 1.5742, + "step": 4725 + }, + { + "epoch": 0.32930355711946485, + "grad_norm": 0.8148378311162706, + "learning_rate": 5.459311257901941e-07, + "loss": 1.5995, + "step": 4726 + }, + { + "epoch": 0.3293732362470822, + "grad_norm": 0.9894964109825274, + "learning_rate": 5.458718434330252e-07, + "loss": 1.4232, + "step": 4727 + }, + { + "epoch": 0.3294429153746995, + "grad_norm": 0.701725083846248, + "learning_rate": 5.458125533665475e-07, + "loss": 1.5014, + "step": 4728 + }, + { + "epoch": 0.32951259450231685, + "grad_norm": 0.7596048302356802, + "learning_rate": 5.45753255593602e-07, + "loss": 1.5457, + "step": 4729 + }, + { + "epoch": 0.32958227362993414, + "grad_norm": 0.6908157411025175, + "learning_rate": 5.456939501170304e-07, + "loss": 1.5558, + "step": 4730 + }, + { + "epoch": 0.3296519527575515, + "grad_norm": 0.7496144153051155, + "learning_rate": 5.456346369396747e-07, + "loss": 1.578, + "step": 4731 + }, + { + "epoch": 0.3297216318851688, + "grad_norm": 0.7164436063279995, + "learning_rate": 5.455753160643777e-07, + "loss": 1.6629, + "step": 4732 + }, + { + "epoch": 0.32979131101278614, + "grad_norm": 0.7559462022136434, + "learning_rate": 5.455159874939819e-07, + "loss": 1.4736, + "step": 4733 + }, + { + "epoch": 0.32986099014040343, + "grad_norm": 0.7600862407544624, + "learning_rate": 5.454566512313302e-07, + "loss": 1.4862, + "step": 4734 + }, + { + "epoch": 0.3299306692680208, + "grad_norm": 0.7004081297809731, + "learning_rate": 5.453973072792665e-07, + "loss": 1.5029, + "step": 4735 + }, + { + "epoch": 0.3300003483956381, + "grad_norm": 0.72591322289454, + "learning_rate": 5.453379556406344e-07, + "loss": 1.5313, + "step": 4736 + }, + { + "epoch": 0.3300700275232554, + "grad_norm": 0.7659965365810644, + "learning_rate": 5.452785963182786e-07, + "loss": 1.5111, + "step": 4737 + }, + { + "epoch": 0.3301397066508727, + "grad_norm": 0.7075696002326214, + "learning_rate": 5.452192293150432e-07, + "loss": 1.6228, + "step": 4738 + }, + { + "epoch": 0.33020938577849007, + "grad_norm": 0.7345339015777442, + "learning_rate": 5.451598546337734e-07, + "loss": 1.6043, + "step": 4739 + }, + { + "epoch": 0.33027906490610737, + "grad_norm": 0.7963071700771978, + "learning_rate": 5.451004722773148e-07, + "loss": 1.6095, + "step": 4740 + }, + { + "epoch": 0.3303487440337247, + "grad_norm": 0.7525640730979843, + "learning_rate": 5.450410822485126e-07, + "loss": 1.4938, + "step": 4741 + }, + { + "epoch": 0.330418423161342, + "grad_norm": 0.6891761133669421, + "learning_rate": 5.449816845502132e-07, + "loss": 1.4215, + "step": 4742 + }, + { + "epoch": 0.33048810228895936, + "grad_norm": 0.7469893291500803, + "learning_rate": 5.449222791852631e-07, + "loss": 1.5721, + "step": 4743 + }, + { + "epoch": 0.33055778141657666, + "grad_norm": 0.7269189267880387, + "learning_rate": 5.448628661565092e-07, + "loss": 1.5862, + "step": 4744 + }, + { + "epoch": 0.330627460544194, + "grad_norm": 0.7483789383148026, + "learning_rate": 5.448034454667984e-07, + "loss": 1.6084, + "step": 4745 + }, + { + "epoch": 0.3306971396718113, + "grad_norm": 0.7177898667623517, + "learning_rate": 5.447440171189784e-07, + "loss": 1.5663, + "step": 4746 + }, + { + "epoch": 0.33076681879942865, + "grad_norm": 0.7687765020088722, + "learning_rate": 5.446845811158973e-07, + "loss": 1.5396, + "step": 4747 + }, + { + "epoch": 0.33083649792704595, + "grad_norm": 0.7445506289197298, + "learning_rate": 5.446251374604032e-07, + "loss": 1.6004, + "step": 4748 + }, + { + "epoch": 0.3309061770546633, + "grad_norm": 0.7628599743997581, + "learning_rate": 5.445656861553449e-07, + "loss": 1.5991, + "step": 4749 + }, + { + "epoch": 0.3309758561822806, + "grad_norm": 0.7690295768355484, + "learning_rate": 5.44506227203571e-07, + "loss": 1.608, + "step": 4750 + }, + { + "epoch": 0.33104553530989794, + "grad_norm": 0.7066359165688519, + "learning_rate": 5.444467606079316e-07, + "loss": 1.553, + "step": 4751 + }, + { + "epoch": 0.33111521443751524, + "grad_norm": 0.7107118246361575, + "learning_rate": 5.443872863712759e-07, + "loss": 1.445, + "step": 4752 + }, + { + "epoch": 0.3311848935651326, + "grad_norm": 0.7189520145837976, + "learning_rate": 5.443278044964542e-07, + "loss": 1.5686, + "step": 4753 + }, + { + "epoch": 0.3312545726927499, + "grad_norm": 0.6934783281831515, + "learning_rate": 5.442683149863171e-07, + "loss": 1.4495, + "step": 4754 + }, + { + "epoch": 0.33132425182036723, + "grad_norm": 0.7524447895980935, + "learning_rate": 5.442088178437154e-07, + "loss": 1.6211, + "step": 4755 + }, + { + "epoch": 0.3313939309479845, + "grad_norm": 0.7002606597250953, + "learning_rate": 5.441493130715002e-07, + "loss": 1.5382, + "step": 4756 + }, + { + "epoch": 0.3314636100756019, + "grad_norm": 0.7542379695219257, + "learning_rate": 5.440898006725234e-07, + "loss": 1.6374, + "step": 4757 + }, + { + "epoch": 0.3315332892032192, + "grad_norm": 0.7312472818227834, + "learning_rate": 5.440302806496365e-07, + "loss": 1.4442, + "step": 4758 + }, + { + "epoch": 0.3316029683308365, + "grad_norm": 0.6765765400729704, + "learning_rate": 5.439707530056922e-07, + "loss": 1.5379, + "step": 4759 + }, + { + "epoch": 0.3316726474584538, + "grad_norm": 0.7145871013427026, + "learning_rate": 5.43911217743543e-07, + "loss": 1.4743, + "step": 4760 + }, + { + "epoch": 0.33174232658607117, + "grad_norm": 0.7534020290273412, + "learning_rate": 5.438516748660421e-07, + "loss": 1.5155, + "step": 4761 + }, + { + "epoch": 0.33181200571368846, + "grad_norm": 0.701107642439635, + "learning_rate": 5.437921243760427e-07, + "loss": 1.5799, + "step": 4762 + }, + { + "epoch": 0.3318816848413058, + "grad_norm": 0.8062590359838392, + "learning_rate": 5.437325662763987e-07, + "loss": 1.6094, + "step": 4763 + }, + { + "epoch": 0.3319513639689231, + "grad_norm": 0.7264396881824076, + "learning_rate": 5.436730005699644e-07, + "loss": 1.5333, + "step": 4764 + }, + { + "epoch": 0.33202104309654046, + "grad_norm": 0.7164683710272938, + "learning_rate": 5.436134272595941e-07, + "loss": 1.643, + "step": 4765 + }, + { + "epoch": 0.33209072222415775, + "grad_norm": 0.7681061882158766, + "learning_rate": 5.435538463481427e-07, + "loss": 1.7029, + "step": 4766 + }, + { + "epoch": 0.3321604013517751, + "grad_norm": 0.68673542098374, + "learning_rate": 5.434942578384654e-07, + "loss": 1.4273, + "step": 4767 + }, + { + "epoch": 0.3322300804793924, + "grad_norm": 0.7630700252413264, + "learning_rate": 5.43434661733418e-07, + "loss": 1.6314, + "step": 4768 + }, + { + "epoch": 0.3322997596070097, + "grad_norm": 0.7378555507316659, + "learning_rate": 5.433750580358563e-07, + "loss": 1.5883, + "step": 4769 + }, + { + "epoch": 0.33236943873462704, + "grad_norm": 0.7184196608437186, + "learning_rate": 5.433154467486367e-07, + "loss": 1.553, + "step": 4770 + }, + { + "epoch": 0.33243911786224434, + "grad_norm": 0.7610107203023418, + "learning_rate": 5.43255827874616e-07, + "loss": 1.5474, + "step": 4771 + }, + { + "epoch": 0.3325087969898617, + "grad_norm": 0.6578353625178328, + "learning_rate": 5.43196201416651e-07, + "loss": 1.5826, + "step": 4772 + }, + { + "epoch": 0.332578476117479, + "grad_norm": 0.7332684613958692, + "learning_rate": 5.43136567377599e-07, + "loss": 1.5332, + "step": 4773 + }, + { + "epoch": 0.33264815524509633, + "grad_norm": 0.7403565072313969, + "learning_rate": 5.430769257603185e-07, + "loss": 1.6311, + "step": 4774 + }, + { + "epoch": 0.33271783437271363, + "grad_norm": 0.7207657489954281, + "learning_rate": 5.43017276567667e-07, + "loss": 1.6177, + "step": 4775 + }, + { + "epoch": 0.332787513500331, + "grad_norm": 0.7303985385444817, + "learning_rate": 5.429576198025032e-07, + "loss": 1.5329, + "step": 4776 + }, + { + "epoch": 0.3328571926279483, + "grad_norm": 0.7035495502689031, + "learning_rate": 5.428979554676861e-07, + "loss": 1.4916, + "step": 4777 + }, + { + "epoch": 0.3329268717555656, + "grad_norm": 0.7003566219294264, + "learning_rate": 5.428382835660746e-07, + "loss": 1.5497, + "step": 4778 + }, + { + "epoch": 0.3329965508831829, + "grad_norm": 0.7612676386996001, + "learning_rate": 5.427786041005286e-07, + "loss": 1.5206, + "step": 4779 + }, + { + "epoch": 0.33306623001080027, + "grad_norm": 0.7094601748268057, + "learning_rate": 5.42718917073908e-07, + "loss": 1.5688, + "step": 4780 + }, + { + "epoch": 0.33313590913841756, + "grad_norm": 0.728996130018037, + "learning_rate": 5.426592224890731e-07, + "loss": 1.5695, + "step": 4781 + }, + { + "epoch": 0.3332055882660349, + "grad_norm": 0.6826206661442296, + "learning_rate": 5.425995203488846e-07, + "loss": 1.5038, + "step": 4782 + }, + { + "epoch": 0.3332752673936522, + "grad_norm": 0.6850069680455925, + "learning_rate": 5.425398106562034e-07, + "loss": 1.4939, + "step": 4783 + }, + { + "epoch": 0.33334494652126956, + "grad_norm": 0.7198182130651017, + "learning_rate": 5.424800934138913e-07, + "loss": 1.6911, + "step": 4784 + }, + { + "epoch": 0.33341462564888685, + "grad_norm": 0.7031776478007508, + "learning_rate": 5.424203686248098e-07, + "loss": 1.3148, + "step": 4785 + }, + { + "epoch": 0.3334843047765042, + "grad_norm": 0.7033765004420741, + "learning_rate": 5.423606362918209e-07, + "loss": 1.5843, + "step": 4786 + }, + { + "epoch": 0.3335539839041215, + "grad_norm": 0.7277452680681038, + "learning_rate": 5.423008964177873e-07, + "loss": 1.6939, + "step": 4787 + }, + { + "epoch": 0.33362366303173885, + "grad_norm": 0.721356540917203, + "learning_rate": 5.422411490055717e-07, + "loss": 1.5653, + "step": 4788 + }, + { + "epoch": 0.33369334215935614, + "grad_norm": 0.7036008560433303, + "learning_rate": 5.421813940580377e-07, + "loss": 1.6107, + "step": 4789 + }, + { + "epoch": 0.3337630212869735, + "grad_norm": 0.7458747498855014, + "learning_rate": 5.421216315780484e-07, + "loss": 1.6373, + "step": 4790 + }, + { + "epoch": 0.3338327004145908, + "grad_norm": 0.7173976160596333, + "learning_rate": 5.420618615684681e-07, + "loss": 1.5497, + "step": 4791 + }, + { + "epoch": 0.33390237954220814, + "grad_norm": 0.7257908124277479, + "learning_rate": 5.420020840321608e-07, + "loss": 1.6506, + "step": 4792 + }, + { + "epoch": 0.33397205866982543, + "grad_norm": 0.7276705606366335, + "learning_rate": 5.419422989719914e-07, + "loss": 1.5947, + "step": 4793 + }, + { + "epoch": 0.3340417377974428, + "grad_norm": 0.7124819335777425, + "learning_rate": 5.418825063908247e-07, + "loss": 1.5129, + "step": 4794 + }, + { + "epoch": 0.3341114169250601, + "grad_norm": 0.6973185583977857, + "learning_rate": 5.418227062915263e-07, + "loss": 1.4769, + "step": 4795 + }, + { + "epoch": 0.33418109605267743, + "grad_norm": 0.7171103000694339, + "learning_rate": 5.417628986769621e-07, + "loss": 1.528, + "step": 4796 + }, + { + "epoch": 0.3342507751802947, + "grad_norm": 0.7212360242606413, + "learning_rate": 5.417030835499978e-07, + "loss": 1.4938, + "step": 4797 + }, + { + "epoch": 0.3343204543079121, + "grad_norm": 0.7517464170152247, + "learning_rate": 5.416432609135e-07, + "loss": 1.609, + "step": 4798 + }, + { + "epoch": 0.33439013343552937, + "grad_norm": 0.8652168233618581, + "learning_rate": 5.415834307703356e-07, + "loss": 1.5062, + "step": 4799 + }, + { + "epoch": 0.3344598125631467, + "grad_norm": 0.7218739889842337, + "learning_rate": 5.415235931233716e-07, + "loss": 1.4711, + "step": 4800 + }, + { + "epoch": 0.334529491690764, + "grad_norm": 0.7094515975723757, + "learning_rate": 5.414637479754757e-07, + "loss": 1.5788, + "step": 4801 + }, + { + "epoch": 0.33459917081838136, + "grad_norm": 0.7393768229184177, + "learning_rate": 5.414038953295158e-07, + "loss": 1.5629, + "step": 4802 + }, + { + "epoch": 0.33466884994599866, + "grad_norm": 0.7302062209177451, + "learning_rate": 5.413440351883602e-07, + "loss": 1.6523, + "step": 4803 + }, + { + "epoch": 0.334738529073616, + "grad_norm": 0.6845338065322232, + "learning_rate": 5.412841675548776e-07, + "loss": 1.522, + "step": 4804 + }, + { + "epoch": 0.3348082082012333, + "grad_norm": 0.7235354663832575, + "learning_rate": 5.412242924319366e-07, + "loss": 1.5268, + "step": 4805 + }, + { + "epoch": 0.33487788732885065, + "grad_norm": 0.7613419830759305, + "learning_rate": 5.411644098224069e-07, + "loss": 1.4734, + "step": 4806 + }, + { + "epoch": 0.33494756645646795, + "grad_norm": 0.6741423643970372, + "learning_rate": 5.411045197291581e-07, + "loss": 1.5251, + "step": 4807 + }, + { + "epoch": 0.3350172455840853, + "grad_norm": 0.7105937235840101, + "learning_rate": 5.410446221550603e-07, + "loss": 1.5714, + "step": 4808 + }, + { + "epoch": 0.3350869247117026, + "grad_norm": 0.7391867622954359, + "learning_rate": 5.409847171029837e-07, + "loss": 1.5494, + "step": 4809 + }, + { + "epoch": 0.33515660383931994, + "grad_norm": 0.697675218773251, + "learning_rate": 5.409248045757993e-07, + "loss": 1.5154, + "step": 4810 + }, + { + "epoch": 0.33522628296693724, + "grad_norm": 0.6927014456831924, + "learning_rate": 5.408648845763781e-07, + "loss": 1.5613, + "step": 4811 + }, + { + "epoch": 0.3352959620945546, + "grad_norm": 0.7007622265131535, + "learning_rate": 5.408049571075917e-07, + "loss": 1.515, + "step": 4812 + }, + { + "epoch": 0.3353656412221719, + "grad_norm": 0.6629006984199853, + "learning_rate": 5.40745022172312e-07, + "loss": 1.4397, + "step": 4813 + }, + { + "epoch": 0.33543532034978923, + "grad_norm": 0.7762079077753187, + "learning_rate": 5.406850797734109e-07, + "loss": 1.7242, + "step": 4814 + }, + { + "epoch": 0.33550499947740653, + "grad_norm": 0.705869108994762, + "learning_rate": 5.406251299137613e-07, + "loss": 1.5714, + "step": 4815 + }, + { + "epoch": 0.3355746786050239, + "grad_norm": 0.6860445057284045, + "learning_rate": 5.405651725962358e-07, + "loss": 1.5411, + "step": 4816 + }, + { + "epoch": 0.3356443577326412, + "grad_norm": 0.6967122024785454, + "learning_rate": 5.405052078237082e-07, + "loss": 1.6579, + "step": 4817 + }, + { + "epoch": 0.3357140368602585, + "grad_norm": 0.7647204058296255, + "learning_rate": 5.404452355990515e-07, + "loss": 1.5023, + "step": 4818 + }, + { + "epoch": 0.3357837159878758, + "grad_norm": 0.7108929741768184, + "learning_rate": 5.403852559251401e-07, + "loss": 1.5517, + "step": 4819 + }, + { + "epoch": 0.33585339511549317, + "grad_norm": 0.8140504410329782, + "learning_rate": 5.403252688048482e-07, + "loss": 1.5006, + "step": 4820 + }, + { + "epoch": 0.33592307424311046, + "grad_norm": 0.6860079696803548, + "learning_rate": 5.402652742410505e-07, + "loss": 1.6411, + "step": 4821 + }, + { + "epoch": 0.3359927533707278, + "grad_norm": 0.7650567940545888, + "learning_rate": 5.402052722366221e-07, + "loss": 1.3765, + "step": 4822 + }, + { + "epoch": 0.3360624324983451, + "grad_norm": 0.7242517343750577, + "learning_rate": 5.401452627944387e-07, + "loss": 1.4242, + "step": 4823 + }, + { + "epoch": 0.33613211162596246, + "grad_norm": 0.7431924617453302, + "learning_rate": 5.400852459173754e-07, + "loss": 1.586, + "step": 4824 + }, + { + "epoch": 0.33620179075357975, + "grad_norm": 0.698467639711488, + "learning_rate": 5.40025221608309e-07, + "loss": 1.5904, + "step": 4825 + }, + { + "epoch": 0.3362714698811971, + "grad_norm": 0.7824488496164661, + "learning_rate": 5.399651898701156e-07, + "loss": 1.4601, + "step": 4826 + }, + { + "epoch": 0.3363411490088144, + "grad_norm": 0.7380126909494652, + "learning_rate": 5.399051507056722e-07, + "loss": 1.421, + "step": 4827 + }, + { + "epoch": 0.33641082813643175, + "grad_norm": 0.7554036049745274, + "learning_rate": 5.39845104117856e-07, + "loss": 1.5047, + "step": 4828 + }, + { + "epoch": 0.33648050726404904, + "grad_norm": 0.6730427825408142, + "learning_rate": 5.397850501095445e-07, + "loss": 1.5169, + "step": 4829 + }, + { + "epoch": 0.3365501863916664, + "grad_norm": 0.7522968508915672, + "learning_rate": 5.397249886836155e-07, + "loss": 1.579, + "step": 4830 + }, + { + "epoch": 0.3366198655192837, + "grad_norm": 0.7656384123478277, + "learning_rate": 5.396649198429476e-07, + "loss": 1.5836, + "step": 4831 + }, + { + "epoch": 0.33668954464690104, + "grad_norm": 0.7095621969140741, + "learning_rate": 5.396048435904192e-07, + "loss": 1.5901, + "step": 4832 + }, + { + "epoch": 0.33675922377451833, + "grad_norm": 0.6984903216291455, + "learning_rate": 5.395447599289092e-07, + "loss": 1.5424, + "step": 4833 + }, + { + "epoch": 0.3368289029021357, + "grad_norm": 0.7296764226484795, + "learning_rate": 5.394846688612969e-07, + "loss": 1.5648, + "step": 4834 + }, + { + "epoch": 0.336898582029753, + "grad_norm": 0.6635513910837075, + "learning_rate": 5.394245703904623e-07, + "loss": 1.4931, + "step": 4835 + }, + { + "epoch": 0.33696826115737033, + "grad_norm": 0.7388466868414264, + "learning_rate": 5.393644645192853e-07, + "loss": 1.4121, + "step": 4836 + }, + { + "epoch": 0.3370379402849876, + "grad_norm": 0.7112377558521906, + "learning_rate": 5.393043512506462e-07, + "loss": 1.4546, + "step": 4837 + }, + { + "epoch": 0.337107619412605, + "grad_norm": 0.727315967629537, + "learning_rate": 5.392442305874258e-07, + "loss": 1.6098, + "step": 4838 + }, + { + "epoch": 0.33717729854022227, + "grad_norm": 0.8099460689341998, + "learning_rate": 5.391841025325051e-07, + "loss": 1.6303, + "step": 4839 + }, + { + "epoch": 0.3372469776678396, + "grad_norm": 0.7051935142489242, + "learning_rate": 5.391239670887659e-07, + "loss": 1.4497, + "step": 4840 + }, + { + "epoch": 0.3373166567954569, + "grad_norm": 0.6783397479396385, + "learning_rate": 5.390638242590897e-07, + "loss": 1.5105, + "step": 4841 + }, + { + "epoch": 0.33738633592307427, + "grad_norm": 0.7347089211759783, + "learning_rate": 5.390036740463587e-07, + "loss": 1.6473, + "step": 4842 + }, + { + "epoch": 0.33745601505069156, + "grad_norm": 0.6732402289305454, + "learning_rate": 5.389435164534555e-07, + "loss": 1.4919, + "step": 4843 + }, + { + "epoch": 0.3375256941783089, + "grad_norm": 0.7148083262660647, + "learning_rate": 5.38883351483263e-07, + "loss": 1.5546, + "step": 4844 + }, + { + "epoch": 0.3375953733059262, + "grad_norm": 0.6836003305145799, + "learning_rate": 5.388231791386643e-07, + "loss": 1.4778, + "step": 4845 + }, + { + "epoch": 0.33766505243354356, + "grad_norm": 0.8084013190577042, + "learning_rate": 5.387629994225432e-07, + "loss": 1.6094, + "step": 4846 + }, + { + "epoch": 0.33773473156116085, + "grad_norm": 0.662801305334541, + "learning_rate": 5.387028123377832e-07, + "loss": 1.4713, + "step": 4847 + }, + { + "epoch": 0.3378044106887782, + "grad_norm": 0.6791407537391915, + "learning_rate": 5.386426178872692e-07, + "loss": 1.5404, + "step": 4848 + }, + { + "epoch": 0.3378740898163955, + "grad_norm": 0.7335026978467318, + "learning_rate": 5.385824160738854e-07, + "loss": 1.5911, + "step": 4849 + }, + { + "epoch": 0.33794376894401285, + "grad_norm": 0.6875140299288989, + "learning_rate": 5.385222069005169e-07, + "loss": 1.5747, + "step": 4850 + }, + { + "epoch": 0.33801344807163014, + "grad_norm": 0.6989430758131691, + "learning_rate": 5.384619903700492e-07, + "loss": 1.4246, + "step": 4851 + }, + { + "epoch": 0.3380831271992475, + "grad_norm": 0.6590948742130245, + "learning_rate": 5.384017664853677e-07, + "loss": 1.5146, + "step": 4852 + }, + { + "epoch": 0.3381528063268648, + "grad_norm": 0.706352782049481, + "learning_rate": 5.383415352493587e-07, + "loss": 1.571, + "step": 4853 + }, + { + "epoch": 0.33822248545448214, + "grad_norm": 0.7519551620740152, + "learning_rate": 5.382812966649086e-07, + "loss": 1.5446, + "step": 4854 + }, + { + "epoch": 0.33829216458209943, + "grad_norm": 0.7099021233898958, + "learning_rate": 5.38221050734904e-07, + "loss": 1.5529, + "step": 4855 + }, + { + "epoch": 0.3383618437097168, + "grad_norm": 0.7199442639569562, + "learning_rate": 5.38160797462232e-07, + "loss": 1.534, + "step": 4856 + }, + { + "epoch": 0.3384315228373341, + "grad_norm": 0.7144789348289255, + "learning_rate": 5.381005368497803e-07, + "loss": 1.4376, + "step": 4857 + }, + { + "epoch": 0.3385012019649514, + "grad_norm": 0.7403548398371584, + "learning_rate": 5.380402689004365e-07, + "loss": 1.5743, + "step": 4858 + }, + { + "epoch": 0.3385708810925687, + "grad_norm": 0.7627039850404922, + "learning_rate": 5.379799936170888e-07, + "loss": 1.5054, + "step": 4859 + }, + { + "epoch": 0.33864056022018607, + "grad_norm": 0.677840868673094, + "learning_rate": 5.379197110026258e-07, + "loss": 1.5203, + "step": 4860 + }, + { + "epoch": 0.33871023934780337, + "grad_norm": 0.7394880148402937, + "learning_rate": 5.378594210599363e-07, + "loss": 1.61, + "step": 4861 + }, + { + "epoch": 0.33877991847542066, + "grad_norm": 0.7543058613191993, + "learning_rate": 5.377991237919096e-07, + "loss": 1.5824, + "step": 4862 + }, + { + "epoch": 0.338849597603038, + "grad_norm": 0.665265338341848, + "learning_rate": 5.377388192014351e-07, + "loss": 1.4639, + "step": 4863 + }, + { + "epoch": 0.3389192767306553, + "grad_norm": 0.711462501878098, + "learning_rate": 5.376785072914029e-07, + "loss": 1.5846, + "step": 4864 + }, + { + "epoch": 0.33898895585827266, + "grad_norm": 0.7270116627711709, + "learning_rate": 5.37618188064703e-07, + "loss": 1.5409, + "step": 4865 + }, + { + "epoch": 0.33905863498588995, + "grad_norm": 0.6769787698247857, + "learning_rate": 5.375578615242263e-07, + "loss": 1.5406, + "step": 4866 + }, + { + "epoch": 0.3391283141135073, + "grad_norm": 0.7193369604108305, + "learning_rate": 5.374975276728638e-07, + "loss": 1.5786, + "step": 4867 + }, + { + "epoch": 0.3391979932411246, + "grad_norm": 0.7083441966840394, + "learning_rate": 5.374371865135067e-07, + "loss": 1.4636, + "step": 4868 + }, + { + "epoch": 0.33926767236874195, + "grad_norm": 0.6917558146241541, + "learning_rate": 5.373768380490466e-07, + "loss": 1.4435, + "step": 4869 + }, + { + "epoch": 0.33933735149635924, + "grad_norm": 0.714922470007537, + "learning_rate": 5.373164822823755e-07, + "loss": 1.4841, + "step": 4870 + }, + { + "epoch": 0.3394070306239766, + "grad_norm": 0.717829779405514, + "learning_rate": 5.37256119216386e-07, + "loss": 1.5644, + "step": 4871 + }, + { + "epoch": 0.3394767097515939, + "grad_norm": 0.7743959381513768, + "learning_rate": 5.371957488539706e-07, + "loss": 1.5185, + "step": 4872 + }, + { + "epoch": 0.33954638887921124, + "grad_norm": 0.8131617550648077, + "learning_rate": 5.371353711980225e-07, + "loss": 1.5316, + "step": 4873 + }, + { + "epoch": 0.33961606800682853, + "grad_norm": 0.7709352415864421, + "learning_rate": 5.370749862514352e-07, + "loss": 1.5742, + "step": 4874 + }, + { + "epoch": 0.3396857471344459, + "grad_norm": 0.7807457832872728, + "learning_rate": 5.370145940171022e-07, + "loss": 1.6104, + "step": 4875 + }, + { + "epoch": 0.3397554262620632, + "grad_norm": 0.7267607315291826, + "learning_rate": 5.369541944979178e-07, + "loss": 1.4927, + "step": 4876 + }, + { + "epoch": 0.3398251053896805, + "grad_norm": 0.7361558349717697, + "learning_rate": 5.368937876967765e-07, + "loss": 1.4753, + "step": 4877 + }, + { + "epoch": 0.3398947845172978, + "grad_norm": 0.6852432333809807, + "learning_rate": 5.36833373616573e-07, + "loss": 1.5538, + "step": 4878 + }, + { + "epoch": 0.33996446364491517, + "grad_norm": 0.7199238314597366, + "learning_rate": 5.367729522602026e-07, + "loss": 1.475, + "step": 4879 + }, + { + "epoch": 0.34003414277253247, + "grad_norm": 0.7732480003051626, + "learning_rate": 5.367125236305607e-07, + "loss": 1.6253, + "step": 4880 + }, + { + "epoch": 0.3401038219001498, + "grad_norm": 0.690556485623902, + "learning_rate": 5.366520877305433e-07, + "loss": 1.6229, + "step": 4881 + }, + { + "epoch": 0.3401735010277671, + "grad_norm": 0.7113155930404748, + "learning_rate": 5.365916445630464e-07, + "loss": 1.5381, + "step": 4882 + }, + { + "epoch": 0.34024318015538446, + "grad_norm": 0.6147503521616917, + "learning_rate": 5.365311941309667e-07, + "loss": 1.4189, + "step": 4883 + }, + { + "epoch": 0.34031285928300176, + "grad_norm": 0.714706834724729, + "learning_rate": 5.36470736437201e-07, + "loss": 1.5128, + "step": 4884 + }, + { + "epoch": 0.3403825384106191, + "grad_norm": 0.7150201217564616, + "learning_rate": 5.364102714846469e-07, + "loss": 1.5691, + "step": 4885 + }, + { + "epoch": 0.3404522175382364, + "grad_norm": 0.7304098295016737, + "learning_rate": 5.363497992762015e-07, + "loss": 1.432, + "step": 4886 + }, + { + "epoch": 0.34052189666585375, + "grad_norm": 0.7570206498289689, + "learning_rate": 5.362893198147631e-07, + "loss": 1.728, + "step": 4887 + }, + { + "epoch": 0.34059157579347105, + "grad_norm": 0.7143465756664162, + "learning_rate": 5.3622883310323e-07, + "loss": 1.5685, + "step": 4888 + }, + { + "epoch": 0.3406612549210884, + "grad_norm": 0.8122541734007522, + "learning_rate": 5.361683391445006e-07, + "loss": 1.5325, + "step": 4889 + }, + { + "epoch": 0.3407309340487057, + "grad_norm": 0.7734092568022763, + "learning_rate": 5.361078379414741e-07, + "loss": 1.4959, + "step": 4890 + }, + { + "epoch": 0.34080061317632304, + "grad_norm": 0.7251845358695838, + "learning_rate": 5.360473294970499e-07, + "loss": 1.5793, + "step": 4891 + }, + { + "epoch": 0.34087029230394034, + "grad_norm": 0.7678069911342846, + "learning_rate": 5.359868138141274e-07, + "loss": 1.4537, + "step": 4892 + }, + { + "epoch": 0.3409399714315577, + "grad_norm": 0.723545658098318, + "learning_rate": 5.359262908956068e-07, + "loss": 1.4833, + "step": 4893 + }, + { + "epoch": 0.341009650559175, + "grad_norm": 0.710062730643498, + "learning_rate": 5.358657607443887e-07, + "loss": 1.5221, + "step": 4894 + }, + { + "epoch": 0.34107932968679233, + "grad_norm": 0.7061472317372298, + "learning_rate": 5.358052233633734e-07, + "loss": 1.5833, + "step": 4895 + }, + { + "epoch": 0.3411490088144096, + "grad_norm": 0.7442311521043259, + "learning_rate": 5.357446787554623e-07, + "loss": 1.5627, + "step": 4896 + }, + { + "epoch": 0.341218687942027, + "grad_norm": 0.6977710712490137, + "learning_rate": 5.356841269235568e-07, + "loss": 1.5103, + "step": 4897 + }, + { + "epoch": 0.34128836706964427, + "grad_norm": 0.6829996598763791, + "learning_rate": 5.356235678705584e-07, + "loss": 1.539, + "step": 4898 + }, + { + "epoch": 0.3413580461972616, + "grad_norm": 0.7041108868124177, + "learning_rate": 5.355630015993696e-07, + "loss": 1.4318, + "step": 4899 + }, + { + "epoch": 0.3414277253248789, + "grad_norm": 0.739511445992145, + "learning_rate": 5.355024281128926e-07, + "loss": 1.6007, + "step": 4900 + }, + { + "epoch": 0.34149740445249627, + "grad_norm": 0.752034002439808, + "learning_rate": 5.354418474140302e-07, + "loss": 1.6381, + "step": 4901 + }, + { + "epoch": 0.34156708358011356, + "grad_norm": 0.7260501221496931, + "learning_rate": 5.353812595056856e-07, + "loss": 1.6835, + "step": 4902 + }, + { + "epoch": 0.3416367627077309, + "grad_norm": 0.6857671150292761, + "learning_rate": 5.353206643907624e-07, + "loss": 1.6081, + "step": 4903 + }, + { + "epoch": 0.3417064418353482, + "grad_norm": 0.7102916293714224, + "learning_rate": 5.352600620721644e-07, + "loss": 1.4413, + "step": 4904 + }, + { + "epoch": 0.34177612096296556, + "grad_norm": 0.6528079878598496, + "learning_rate": 5.351994525527957e-07, + "loss": 1.5078, + "step": 4905 + }, + { + "epoch": 0.34184580009058285, + "grad_norm": 0.765193785606818, + "learning_rate": 5.351388358355609e-07, + "loss": 1.4708, + "step": 4906 + }, + { + "epoch": 0.3419154792182002, + "grad_norm": 0.7233681669435182, + "learning_rate": 5.350782119233648e-07, + "loss": 1.5652, + "step": 4907 + }, + { + "epoch": 0.3419851583458175, + "grad_norm": 0.6973226611884625, + "learning_rate": 5.350175808191127e-07, + "loss": 1.5033, + "step": 4908 + }, + { + "epoch": 0.34205483747343485, + "grad_norm": 0.7136416717450348, + "learning_rate": 5.349569425257101e-07, + "loss": 1.4198, + "step": 4909 + }, + { + "epoch": 0.34212451660105214, + "grad_norm": 0.7693775797382657, + "learning_rate": 5.348962970460631e-07, + "loss": 1.409, + "step": 4910 + }, + { + "epoch": 0.3421941957286695, + "grad_norm": 0.7336409853595478, + "learning_rate": 5.348356443830777e-07, + "loss": 1.4816, + "step": 4911 + }, + { + "epoch": 0.3422638748562868, + "grad_norm": 0.6823577987189021, + "learning_rate": 5.347749845396606e-07, + "loss": 1.5111, + "step": 4912 + }, + { + "epoch": 0.34233355398390414, + "grad_norm": 0.7167651243573857, + "learning_rate": 5.347143175187188e-07, + "loss": 1.3348, + "step": 4913 + }, + { + "epoch": 0.34240323311152143, + "grad_norm": 0.7398765728960008, + "learning_rate": 5.346536433231596e-07, + "loss": 1.465, + "step": 4914 + }, + { + "epoch": 0.3424729122391388, + "grad_norm": 0.7291279264814856, + "learning_rate": 5.345929619558905e-07, + "loss": 1.4537, + "step": 4915 + }, + { + "epoch": 0.3425425913667561, + "grad_norm": 0.6960763279406099, + "learning_rate": 5.345322734198196e-07, + "loss": 1.5036, + "step": 4916 + }, + { + "epoch": 0.3426122704943734, + "grad_norm": 0.7419002594743389, + "learning_rate": 5.344715777178551e-07, + "loss": 1.6412, + "step": 4917 + }, + { + "epoch": 0.3426819496219907, + "grad_norm": 0.7595820712509183, + "learning_rate": 5.344108748529058e-07, + "loss": 1.6397, + "step": 4918 + }, + { + "epoch": 0.3427516287496081, + "grad_norm": 0.7694766737856703, + "learning_rate": 5.343501648278807e-07, + "loss": 1.6078, + "step": 4919 + }, + { + "epoch": 0.34282130787722537, + "grad_norm": 0.6807981058044563, + "learning_rate": 5.342894476456889e-07, + "loss": 1.4479, + "step": 4920 + }, + { + "epoch": 0.3428909870048427, + "grad_norm": 0.754974484289456, + "learning_rate": 5.342287233092405e-07, + "loss": 1.6514, + "step": 4921 + }, + { + "epoch": 0.34296066613246, + "grad_norm": 0.7535510628217905, + "learning_rate": 5.341679918214452e-07, + "loss": 1.5696, + "step": 4922 + }, + { + "epoch": 0.34303034526007736, + "grad_norm": 0.7357029495555406, + "learning_rate": 5.341072531852134e-07, + "loss": 1.521, + "step": 4923 + }, + { + "epoch": 0.34310002438769466, + "grad_norm": 0.7082023447167447, + "learning_rate": 5.340465074034562e-07, + "loss": 1.5007, + "step": 4924 + }, + { + "epoch": 0.343169703515312, + "grad_norm": 0.7097395892281676, + "learning_rate": 5.339857544790843e-07, + "loss": 1.4334, + "step": 4925 + }, + { + "epoch": 0.3432393826429293, + "grad_norm": 0.7437771128644068, + "learning_rate": 5.33924994415009e-07, + "loss": 1.6869, + "step": 4926 + }, + { + "epoch": 0.34330906177054665, + "grad_norm": 0.7222212332837278, + "learning_rate": 5.338642272141424e-07, + "loss": 1.6559, + "step": 4927 + }, + { + "epoch": 0.34337874089816395, + "grad_norm": 0.774011346047693, + "learning_rate": 5.338034528793963e-07, + "loss": 1.6342, + "step": 4928 + }, + { + "epoch": 0.3434484200257813, + "grad_norm": 0.6966346592207932, + "learning_rate": 5.337426714136832e-07, + "loss": 1.4734, + "step": 4929 + }, + { + "epoch": 0.3435180991533986, + "grad_norm": 0.788871599051374, + "learning_rate": 5.33681882819916e-07, + "loss": 1.5862, + "step": 4930 + }, + { + "epoch": 0.34358777828101594, + "grad_norm": 0.7397766374316244, + "learning_rate": 5.336210871010078e-07, + "loss": 1.6653, + "step": 4931 + }, + { + "epoch": 0.34365745740863324, + "grad_norm": 0.7180829427756388, + "learning_rate": 5.335602842598721e-07, + "loss": 1.5283, + "step": 4932 + }, + { + "epoch": 0.3437271365362506, + "grad_norm": 0.7348660111880068, + "learning_rate": 5.334994742994224e-07, + "loss": 1.618, + "step": 4933 + }, + { + "epoch": 0.3437968156638679, + "grad_norm": 0.7752690830193244, + "learning_rate": 5.33438657222573e-07, + "loss": 1.6631, + "step": 4934 + }, + { + "epoch": 0.34386649479148523, + "grad_norm": 0.7061573790825465, + "learning_rate": 5.333778330322386e-07, + "loss": 1.5554, + "step": 4935 + }, + { + "epoch": 0.34393617391910253, + "grad_norm": 0.6966452925169359, + "learning_rate": 5.333170017313336e-07, + "loss": 1.4936, + "step": 4936 + }, + { + "epoch": 0.3440058530467199, + "grad_norm": 0.7108489950765159, + "learning_rate": 5.332561633227736e-07, + "loss": 1.49, + "step": 4937 + }, + { + "epoch": 0.3440755321743372, + "grad_norm": 0.7360768060995024, + "learning_rate": 5.331953178094737e-07, + "loss": 1.5874, + "step": 4938 + }, + { + "epoch": 0.3441452113019545, + "grad_norm": 0.7061287583659489, + "learning_rate": 5.3313446519435e-07, + "loss": 1.5182, + "step": 4939 + }, + { + "epoch": 0.3442148904295718, + "grad_norm": 0.7531667234468807, + "learning_rate": 5.330736054803186e-07, + "loss": 1.5243, + "step": 4940 + }, + { + "epoch": 0.34428456955718917, + "grad_norm": 0.649115378193036, + "learning_rate": 5.330127386702962e-07, + "loss": 1.4874, + "step": 4941 + }, + { + "epoch": 0.34435424868480646, + "grad_norm": 0.6839245166901333, + "learning_rate": 5.329518647671992e-07, + "loss": 1.4666, + "step": 4942 + }, + { + "epoch": 0.3444239278124238, + "grad_norm": 0.6955071209132906, + "learning_rate": 5.328909837739454e-07, + "loss": 1.6033, + "step": 4943 + }, + { + "epoch": 0.3444936069400411, + "grad_norm": 0.7160416982988688, + "learning_rate": 5.328300956934519e-07, + "loss": 1.4956, + "step": 4944 + }, + { + "epoch": 0.34456328606765846, + "grad_norm": 0.6908646469509884, + "learning_rate": 5.327692005286366e-07, + "loss": 1.5294, + "step": 4945 + }, + { + "epoch": 0.34463296519527575, + "grad_norm": 0.7472362365311507, + "learning_rate": 5.327082982824181e-07, + "loss": 1.5895, + "step": 4946 + }, + { + "epoch": 0.3447026443228931, + "grad_norm": 0.7505746601450485, + "learning_rate": 5.326473889577145e-07, + "loss": 1.4966, + "step": 4947 + }, + { + "epoch": 0.3447723234505104, + "grad_norm": 0.6600104860006836, + "learning_rate": 5.325864725574451e-07, + "loss": 1.4753, + "step": 4948 + }, + { + "epoch": 0.34484200257812775, + "grad_norm": 0.7586810928023145, + "learning_rate": 5.325255490845287e-07, + "loss": 1.5685, + "step": 4949 + }, + { + "epoch": 0.34491168170574504, + "grad_norm": 0.7852011275224828, + "learning_rate": 5.324646185418853e-07, + "loss": 1.6254, + "step": 4950 + }, + { + "epoch": 0.3449813608333624, + "grad_norm": 0.7464220138774134, + "learning_rate": 5.324036809324347e-07, + "loss": 1.6366, + "step": 4951 + }, + { + "epoch": 0.3450510399609797, + "grad_norm": 0.707679820201954, + "learning_rate": 5.32342736259097e-07, + "loss": 1.5273, + "step": 4952 + }, + { + "epoch": 0.345120719088597, + "grad_norm": 0.6649346495566145, + "learning_rate": 5.322817845247929e-07, + "loss": 1.5056, + "step": 4953 + }, + { + "epoch": 0.34519039821621433, + "grad_norm": 0.7446126608675369, + "learning_rate": 5.322208257324433e-07, + "loss": 1.5291, + "step": 4954 + }, + { + "epoch": 0.34526007734383163, + "grad_norm": 0.7157920582125764, + "learning_rate": 5.321598598849695e-07, + "loss": 1.5463, + "step": 4955 + }, + { + "epoch": 0.345329756471449, + "grad_norm": 0.7130418175302066, + "learning_rate": 5.32098886985293e-07, + "loss": 1.5481, + "step": 4956 + }, + { + "epoch": 0.3453994355990663, + "grad_norm": 0.6549977386565085, + "learning_rate": 5.320379070363359e-07, + "loss": 1.38, + "step": 4957 + }, + { + "epoch": 0.3454691147266836, + "grad_norm": 0.6719591171989239, + "learning_rate": 5.319769200410205e-07, + "loss": 1.5743, + "step": 4958 + }, + { + "epoch": 0.3455387938543009, + "grad_norm": 0.7086929840190598, + "learning_rate": 5.319159260022694e-07, + "loss": 1.5321, + "step": 4959 + }, + { + "epoch": 0.34560847298191827, + "grad_norm": 0.7158822266473911, + "learning_rate": 5.318549249230055e-07, + "loss": 1.558, + "step": 4960 + }, + { + "epoch": 0.34567815210953556, + "grad_norm": 0.7338554111996759, + "learning_rate": 5.317939168061521e-07, + "loss": 1.6296, + "step": 4961 + }, + { + "epoch": 0.3457478312371529, + "grad_norm": 0.716688713173419, + "learning_rate": 5.317329016546326e-07, + "loss": 1.5146, + "step": 4962 + }, + { + "epoch": 0.3458175103647702, + "grad_norm": 0.6746045493486954, + "learning_rate": 5.316718794713716e-07, + "loss": 1.4125, + "step": 4963 + }, + { + "epoch": 0.34588718949238756, + "grad_norm": 0.7673016809121121, + "learning_rate": 5.316108502592928e-07, + "loss": 1.4457, + "step": 4964 + }, + { + "epoch": 0.34595686862000485, + "grad_norm": 0.6981807378933002, + "learning_rate": 5.315498140213211e-07, + "loss": 1.4776, + "step": 4965 + }, + { + "epoch": 0.3460265477476222, + "grad_norm": 0.7111768030048936, + "learning_rate": 5.314887707603814e-07, + "loss": 1.4599, + "step": 4966 + }, + { + "epoch": 0.3460962268752395, + "grad_norm": 0.7897503701287117, + "learning_rate": 5.314277204793992e-07, + "loss": 1.6354, + "step": 4967 + }, + { + "epoch": 0.34616590600285685, + "grad_norm": 0.7191071514005091, + "learning_rate": 5.313666631813e-07, + "loss": 1.5362, + "step": 4968 + }, + { + "epoch": 0.34623558513047414, + "grad_norm": 0.7216111690405148, + "learning_rate": 5.313055988690098e-07, + "loss": 1.4787, + "step": 4969 + }, + { + "epoch": 0.3463052642580915, + "grad_norm": 0.6989929154312301, + "learning_rate": 5.312445275454549e-07, + "loss": 1.5059, + "step": 4970 + }, + { + "epoch": 0.3463749433857088, + "grad_norm": 0.7180325961762156, + "learning_rate": 5.31183449213562e-07, + "loss": 1.5621, + "step": 4971 + }, + { + "epoch": 0.34644462251332614, + "grad_norm": 0.6972183647561123, + "learning_rate": 5.311223638762581e-07, + "loss": 1.5391, + "step": 4972 + }, + { + "epoch": 0.34651430164094343, + "grad_norm": 0.7371303123998308, + "learning_rate": 5.310612715364705e-07, + "loss": 1.4495, + "step": 4973 + }, + { + "epoch": 0.3465839807685608, + "grad_norm": 0.7235927075509028, + "learning_rate": 5.31000172197127e-07, + "loss": 1.66, + "step": 4974 + }, + { + "epoch": 0.3466536598961781, + "grad_norm": 0.6811158833110971, + "learning_rate": 5.309390658611555e-07, + "loss": 1.5143, + "step": 4975 + }, + { + "epoch": 0.34672333902379543, + "grad_norm": 0.7297291177884462, + "learning_rate": 5.308779525314844e-07, + "loss": 1.6434, + "step": 4976 + }, + { + "epoch": 0.3467930181514127, + "grad_norm": 0.6648605494038474, + "learning_rate": 5.308168322110423e-07, + "loss": 1.4774, + "step": 4977 + }, + { + "epoch": 0.3468626972790301, + "grad_norm": 0.777247005511412, + "learning_rate": 5.307557049027582e-07, + "loss": 1.4144, + "step": 4978 + }, + { + "epoch": 0.34693237640664737, + "grad_norm": 0.6929973859668674, + "learning_rate": 5.306945706095615e-07, + "loss": 1.5942, + "step": 4979 + }, + { + "epoch": 0.3470020555342647, + "grad_norm": 0.7283578679121835, + "learning_rate": 5.30633429334382e-07, + "loss": 1.5701, + "step": 4980 + }, + { + "epoch": 0.347071734661882, + "grad_norm": 0.7320143590031736, + "learning_rate": 5.305722810801493e-07, + "loss": 1.5392, + "step": 4981 + }, + { + "epoch": 0.34714141378949936, + "grad_norm": 0.6903230410445872, + "learning_rate": 5.305111258497943e-07, + "loss": 1.6587, + "step": 4982 + }, + { + "epoch": 0.34721109291711666, + "grad_norm": 0.6754268076389439, + "learning_rate": 5.304499636462473e-07, + "loss": 1.5187, + "step": 4983 + }, + { + "epoch": 0.347280772044734, + "grad_norm": 0.7071926402253579, + "learning_rate": 5.303887944724396e-07, + "loss": 1.5039, + "step": 4984 + }, + { + "epoch": 0.3473504511723513, + "grad_norm": 0.7314158959654293, + "learning_rate": 5.303276183313022e-07, + "loss": 1.4893, + "step": 4985 + }, + { + "epoch": 0.34742013029996865, + "grad_norm": 0.717579235163545, + "learning_rate": 5.30266435225767e-07, + "loss": 1.5723, + "step": 4986 + }, + { + "epoch": 0.34748980942758595, + "grad_norm": 0.670827472100565, + "learning_rate": 5.302052451587659e-07, + "loss": 1.4619, + "step": 4987 + }, + { + "epoch": 0.3475594885552033, + "grad_norm": 0.7071414740051439, + "learning_rate": 5.301440481332316e-07, + "loss": 1.6362, + "step": 4988 + }, + { + "epoch": 0.3476291676828206, + "grad_norm": 0.7195762826997261, + "learning_rate": 5.300828441520965e-07, + "loss": 1.5261, + "step": 4989 + }, + { + "epoch": 0.34769884681043794, + "grad_norm": 0.7269680457866935, + "learning_rate": 5.300216332182934e-07, + "loss": 1.5192, + "step": 4990 + }, + { + "epoch": 0.34776852593805524, + "grad_norm": 0.7167959844489366, + "learning_rate": 5.299604153347562e-07, + "loss": 1.5301, + "step": 4991 + }, + { + "epoch": 0.3478382050656726, + "grad_norm": 0.7521845229687258, + "learning_rate": 5.298991905044182e-07, + "loss": 1.5524, + "step": 4992 + }, + { + "epoch": 0.3479078841932899, + "grad_norm": 0.7313888512216582, + "learning_rate": 5.298379587302136e-07, + "loss": 1.5929, + "step": 4993 + }, + { + "epoch": 0.34797756332090723, + "grad_norm": 0.670917170186345, + "learning_rate": 5.297767200150765e-07, + "loss": 1.6268, + "step": 4994 + }, + { + "epoch": 0.34804724244852453, + "grad_norm": 0.7296243936097038, + "learning_rate": 5.297154743619418e-07, + "loss": 1.5761, + "step": 4995 + }, + { + "epoch": 0.3481169215761419, + "grad_norm": 0.6621500834968012, + "learning_rate": 5.296542217737445e-07, + "loss": 1.3975, + "step": 4996 + }, + { + "epoch": 0.3481866007037592, + "grad_norm": 0.6841736192653086, + "learning_rate": 5.2959296225342e-07, + "loss": 1.4708, + "step": 4997 + }, + { + "epoch": 0.3482562798313765, + "grad_norm": 0.7506006835561448, + "learning_rate": 5.295316958039038e-07, + "loss": 1.6007, + "step": 4998 + }, + { + "epoch": 0.3483259589589938, + "grad_norm": 0.7525641487586818, + "learning_rate": 5.294704224281321e-07, + "loss": 1.5337, + "step": 4999 + }, + { + "epoch": 0.34839563808661117, + "grad_norm": 0.8019951480668435, + "learning_rate": 5.294091421290412e-07, + "loss": 1.6959, + "step": 5000 + }, + { + "epoch": 0.34846531721422846, + "grad_norm": 0.9289139139040437, + "learning_rate": 5.293478549095676e-07, + "loss": 1.5733, + "step": 5001 + }, + { + "epoch": 0.3485349963418458, + "grad_norm": 0.7366925407109421, + "learning_rate": 5.292865607726485e-07, + "loss": 1.5289, + "step": 5002 + }, + { + "epoch": 0.3486046754694631, + "grad_norm": 0.6901044411339604, + "learning_rate": 5.292252597212212e-07, + "loss": 1.5065, + "step": 5003 + }, + { + "epoch": 0.34867435459708046, + "grad_norm": 0.7858548036787403, + "learning_rate": 5.291639517582235e-07, + "loss": 1.7448, + "step": 5004 + }, + { + "epoch": 0.34874403372469776, + "grad_norm": 0.6910046587719392, + "learning_rate": 5.291026368865932e-07, + "loss": 1.4958, + "step": 5005 + }, + { + "epoch": 0.3488137128523151, + "grad_norm": 0.7131421363913354, + "learning_rate": 5.290413151092685e-07, + "loss": 1.5104, + "step": 5006 + }, + { + "epoch": 0.3488833919799324, + "grad_norm": 0.678645451016774, + "learning_rate": 5.289799864291884e-07, + "loss": 1.6023, + "step": 5007 + }, + { + "epoch": 0.34895307110754975, + "grad_norm": 0.7135897875529243, + "learning_rate": 5.289186508492918e-07, + "loss": 1.5097, + "step": 5008 + }, + { + "epoch": 0.34902275023516705, + "grad_norm": 0.6925868792365488, + "learning_rate": 5.288573083725181e-07, + "loss": 1.4276, + "step": 5009 + }, + { + "epoch": 0.3490924293627844, + "grad_norm": 0.7355852311495054, + "learning_rate": 5.287959590018066e-07, + "loss": 1.6468, + "step": 5010 + }, + { + "epoch": 0.3491621084904017, + "grad_norm": 0.6591714591613197, + "learning_rate": 5.287346027400978e-07, + "loss": 1.4403, + "step": 5011 + }, + { + "epoch": 0.34923178761801904, + "grad_norm": 0.7384149162418628, + "learning_rate": 5.286732395903316e-07, + "loss": 1.5242, + "step": 5012 + }, + { + "epoch": 0.34930146674563634, + "grad_norm": 0.7951440907862842, + "learning_rate": 5.286118695554488e-07, + "loss": 1.5518, + "step": 5013 + }, + { + "epoch": 0.3493711458732537, + "grad_norm": 0.690403671698083, + "learning_rate": 5.285504926383904e-07, + "loss": 1.4066, + "step": 5014 + }, + { + "epoch": 0.349440825000871, + "grad_norm": 0.734429986958859, + "learning_rate": 5.284891088420977e-07, + "loss": 1.6652, + "step": 5015 + }, + { + "epoch": 0.34951050412848833, + "grad_norm": 0.6791322665246944, + "learning_rate": 5.284277181695124e-07, + "loss": 1.4876, + "step": 5016 + }, + { + "epoch": 0.3495801832561056, + "grad_norm": 0.7333763237125314, + "learning_rate": 5.283663206235762e-07, + "loss": 1.558, + "step": 5017 + }, + { + "epoch": 0.349649862383723, + "grad_norm": 0.7894035079650024, + "learning_rate": 5.283049162072316e-07, + "loss": 1.471, + "step": 5018 + }, + { + "epoch": 0.34971954151134027, + "grad_norm": 0.7444643304296316, + "learning_rate": 5.282435049234214e-07, + "loss": 1.522, + "step": 5019 + }, + { + "epoch": 0.3497892206389576, + "grad_norm": 0.6643970018185514, + "learning_rate": 5.281820867750883e-07, + "loss": 1.5659, + "step": 5020 + }, + { + "epoch": 0.3498588997665749, + "grad_norm": 0.7232823075509024, + "learning_rate": 5.281206617651756e-07, + "loss": 1.5947, + "step": 5021 + }, + { + "epoch": 0.34992857889419227, + "grad_norm": 0.7206660363783639, + "learning_rate": 5.280592298966271e-07, + "loss": 1.5868, + "step": 5022 + }, + { + "epoch": 0.34999825802180956, + "grad_norm": 0.7874148940637059, + "learning_rate": 5.279977911723866e-07, + "loss": 1.6011, + "step": 5023 + }, + { + "epoch": 0.3500679371494269, + "grad_norm": 0.7463976753010189, + "learning_rate": 5.279363455953982e-07, + "loss": 1.6795, + "step": 5024 + }, + { + "epoch": 0.3501376162770442, + "grad_norm": 0.7140576677032477, + "learning_rate": 5.278748931686068e-07, + "loss": 1.4726, + "step": 5025 + }, + { + "epoch": 0.35020729540466156, + "grad_norm": 0.8032460802640788, + "learning_rate": 5.278134338949572e-07, + "loss": 1.5765, + "step": 5026 + }, + { + "epoch": 0.35027697453227885, + "grad_norm": 0.736073108144398, + "learning_rate": 5.277519677773946e-07, + "loss": 1.6496, + "step": 5027 + }, + { + "epoch": 0.3503466536598962, + "grad_norm": 0.7433157359642669, + "learning_rate": 5.276904948188647e-07, + "loss": 1.607, + "step": 5028 + }, + { + "epoch": 0.3504163327875135, + "grad_norm": 0.7302026377592102, + "learning_rate": 5.276290150223133e-07, + "loss": 1.5698, + "step": 5029 + }, + { + "epoch": 0.35048601191513085, + "grad_norm": 0.7156186997183115, + "learning_rate": 5.275675283906867e-07, + "loss": 1.6206, + "step": 5030 + }, + { + "epoch": 0.35055569104274814, + "grad_norm": 0.684118999147647, + "learning_rate": 5.275060349269315e-07, + "loss": 1.5142, + "step": 5031 + }, + { + "epoch": 0.3506253701703655, + "grad_norm": 0.7293360774488382, + "learning_rate": 5.274445346339945e-07, + "loss": 1.5432, + "step": 5032 + }, + { + "epoch": 0.3506950492979828, + "grad_norm": 0.7374356798123075, + "learning_rate": 5.273830275148231e-07, + "loss": 1.547, + "step": 5033 + }, + { + "epoch": 0.35076472842560014, + "grad_norm": 0.6984990019219187, + "learning_rate": 5.273215135723644e-07, + "loss": 1.5509, + "step": 5034 + }, + { + "epoch": 0.35083440755321743, + "grad_norm": 0.6984423015837139, + "learning_rate": 5.272599928095669e-07, + "loss": 1.6198, + "step": 5035 + }, + { + "epoch": 0.3509040866808348, + "grad_norm": 0.7221225311495126, + "learning_rate": 5.271984652293784e-07, + "loss": 1.5365, + "step": 5036 + }, + { + "epoch": 0.3509737658084521, + "grad_norm": 0.6832259001328025, + "learning_rate": 5.271369308347475e-07, + "loss": 1.5944, + "step": 5037 + }, + { + "epoch": 0.3510434449360694, + "grad_norm": 0.7288757483748387, + "learning_rate": 5.27075389628623e-07, + "loss": 1.565, + "step": 5038 + }, + { + "epoch": 0.3511131240636867, + "grad_norm": 0.773464821300105, + "learning_rate": 5.270138416139543e-07, + "loss": 1.5954, + "step": 5039 + }, + { + "epoch": 0.35118280319130407, + "grad_norm": 0.719342206905609, + "learning_rate": 5.269522867936905e-07, + "loss": 1.4334, + "step": 5040 + }, + { + "epoch": 0.35125248231892137, + "grad_norm": 0.7003743350586481, + "learning_rate": 5.268907251707821e-07, + "loss": 1.5568, + "step": 5041 + }, + { + "epoch": 0.3513221614465387, + "grad_norm": 0.6756700295397962, + "learning_rate": 5.268291567481786e-07, + "loss": 1.4975, + "step": 5042 + }, + { + "epoch": 0.351391840574156, + "grad_norm": 0.7101359625406757, + "learning_rate": 5.267675815288307e-07, + "loss": 1.5818, + "step": 5043 + }, + { + "epoch": 0.3514615197017733, + "grad_norm": 0.7656785077162607, + "learning_rate": 5.267059995156894e-07, + "loss": 1.4525, + "step": 5044 + }, + { + "epoch": 0.35153119882939066, + "grad_norm": 0.7206756017968566, + "learning_rate": 5.266444107117056e-07, + "loss": 1.5277, + "step": 5045 + }, + { + "epoch": 0.35160087795700795, + "grad_norm": 0.7893166371200515, + "learning_rate": 5.265828151198307e-07, + "loss": 1.7029, + "step": 5046 + }, + { + "epoch": 0.3516705570846253, + "grad_norm": 0.7090526225797813, + "learning_rate": 5.265212127430169e-07, + "loss": 1.5572, + "step": 5047 + }, + { + "epoch": 0.3517402362122426, + "grad_norm": 0.7587655099716046, + "learning_rate": 5.264596035842158e-07, + "loss": 1.4987, + "step": 5048 + }, + { + "epoch": 0.35180991533985995, + "grad_norm": 0.6922136341394148, + "learning_rate": 5.263979876463804e-07, + "loss": 1.4961, + "step": 5049 + }, + { + "epoch": 0.35187959446747724, + "grad_norm": 0.7018301647037124, + "learning_rate": 5.263363649324629e-07, + "loss": 1.5998, + "step": 5050 + }, + { + "epoch": 0.3519492735950946, + "grad_norm": 0.7100948833017958, + "learning_rate": 5.262747354454167e-07, + "loss": 1.6238, + "step": 5051 + }, + { + "epoch": 0.3520189527227119, + "grad_norm": 0.6878432194296483, + "learning_rate": 5.262130991881952e-07, + "loss": 1.5418, + "step": 5052 + }, + { + "epoch": 0.35208863185032924, + "grad_norm": 0.739369196023084, + "learning_rate": 5.26151456163752e-07, + "loss": 1.5088, + "step": 5053 + }, + { + "epoch": 0.35215831097794653, + "grad_norm": 0.7327168663945245, + "learning_rate": 5.260898063750413e-07, + "loss": 1.5453, + "step": 5054 + }, + { + "epoch": 0.3522279901055639, + "grad_norm": 0.7494770476194448, + "learning_rate": 5.260281498250174e-07, + "loss": 1.5667, + "step": 5055 + }, + { + "epoch": 0.3522976692331812, + "grad_norm": 0.7003511050728188, + "learning_rate": 5.25966486516635e-07, + "loss": 1.4696, + "step": 5056 + }, + { + "epoch": 0.3523673483607985, + "grad_norm": 0.682006424136004, + "learning_rate": 5.259048164528492e-07, + "loss": 1.5554, + "step": 5057 + }, + { + "epoch": 0.3524370274884158, + "grad_norm": 0.7261598036764687, + "learning_rate": 5.258431396366154e-07, + "loss": 1.5927, + "step": 5058 + }, + { + "epoch": 0.35250670661603317, + "grad_norm": 0.7304805813607402, + "learning_rate": 5.257814560708891e-07, + "loss": 1.5012, + "step": 5059 + }, + { + "epoch": 0.35257638574365047, + "grad_norm": 0.701653516106193, + "learning_rate": 5.257197657586264e-07, + "loss": 1.4971, + "step": 5060 + }, + { + "epoch": 0.3526460648712678, + "grad_norm": 0.7665366559302987, + "learning_rate": 5.256580687027837e-07, + "loss": 1.5559, + "step": 5061 + }, + { + "epoch": 0.3527157439988851, + "grad_norm": 0.6825767430611114, + "learning_rate": 5.255963649063176e-07, + "loss": 1.6248, + "step": 5062 + }, + { + "epoch": 0.35278542312650246, + "grad_norm": 0.7166776263768039, + "learning_rate": 5.255346543721849e-07, + "loss": 1.5674, + "step": 5063 + }, + { + "epoch": 0.35285510225411976, + "grad_norm": 0.6688400141973269, + "learning_rate": 5.254729371033433e-07, + "loss": 1.5386, + "step": 5064 + }, + { + "epoch": 0.3529247813817371, + "grad_norm": 0.7341742905821448, + "learning_rate": 5.2541121310275e-07, + "loss": 1.5394, + "step": 5065 + }, + { + "epoch": 0.3529944605093544, + "grad_norm": 0.7139855856855144, + "learning_rate": 5.25349482373363e-07, + "loss": 1.5032, + "step": 5066 + }, + { + "epoch": 0.35306413963697175, + "grad_norm": 0.7377531592463131, + "learning_rate": 5.252877449181409e-07, + "loss": 1.6183, + "step": 5067 + }, + { + "epoch": 0.35313381876458905, + "grad_norm": 0.6992765237751483, + "learning_rate": 5.25226000740042e-07, + "loss": 1.5182, + "step": 5068 + }, + { + "epoch": 0.3532034978922064, + "grad_norm": 0.8121552367033361, + "learning_rate": 5.251642498420253e-07, + "loss": 1.5733, + "step": 5069 + }, + { + "epoch": 0.3532731770198237, + "grad_norm": 0.7225162064445212, + "learning_rate": 5.251024922270498e-07, + "loss": 1.6745, + "step": 5070 + }, + { + "epoch": 0.35334285614744104, + "grad_norm": 0.7220584431415679, + "learning_rate": 5.250407278980753e-07, + "loss": 1.6101, + "step": 5071 + }, + { + "epoch": 0.35341253527505834, + "grad_norm": 0.7338355042291222, + "learning_rate": 5.249789568580618e-07, + "loss": 1.522, + "step": 5072 + }, + { + "epoch": 0.3534822144026757, + "grad_norm": 0.6781864282217835, + "learning_rate": 5.249171791099692e-07, + "loss": 1.4984, + "step": 5073 + }, + { + "epoch": 0.353551893530293, + "grad_norm": 0.7579599868943648, + "learning_rate": 5.248553946567581e-07, + "loss": 1.6135, + "step": 5074 + }, + { + "epoch": 0.35362157265791033, + "grad_norm": 0.740562375568996, + "learning_rate": 5.247936035013895e-07, + "loss": 1.5535, + "step": 5075 + }, + { + "epoch": 0.3536912517855276, + "grad_norm": 0.6684099288907697, + "learning_rate": 5.247318056468243e-07, + "loss": 1.5008, + "step": 5076 + }, + { + "epoch": 0.353760930913145, + "grad_norm": 0.6961783128265661, + "learning_rate": 5.246700010960242e-07, + "loss": 1.3987, + "step": 5077 + }, + { + "epoch": 0.3538306100407623, + "grad_norm": 0.7742250152885914, + "learning_rate": 5.246081898519508e-07, + "loss": 1.4998, + "step": 5078 + }, + { + "epoch": 0.3539002891683796, + "grad_norm": 0.6974290524914329, + "learning_rate": 5.245463719175663e-07, + "loss": 1.4631, + "step": 5079 + }, + { + "epoch": 0.3539699682959969, + "grad_norm": 0.7394144947732425, + "learning_rate": 5.244845472958334e-07, + "loss": 1.6719, + "step": 5080 + }, + { + "epoch": 0.35403964742361427, + "grad_norm": 0.7306324314278728, + "learning_rate": 5.244227159897145e-07, + "loss": 1.4915, + "step": 5081 + }, + { + "epoch": 0.35410932655123156, + "grad_norm": 0.742438727557881, + "learning_rate": 5.243608780021729e-07, + "loss": 1.5282, + "step": 5082 + }, + { + "epoch": 0.3541790056788489, + "grad_norm": 0.7491554405441359, + "learning_rate": 5.242990333361718e-07, + "loss": 1.4513, + "step": 5083 + }, + { + "epoch": 0.3542486848064662, + "grad_norm": 0.7174024426449469, + "learning_rate": 5.242371819946751e-07, + "loss": 1.4468, + "step": 5084 + }, + { + "epoch": 0.35431836393408356, + "grad_norm": 0.6926765842468269, + "learning_rate": 5.241753239806468e-07, + "loss": 1.6201, + "step": 5085 + }, + { + "epoch": 0.35438804306170085, + "grad_norm": 0.6878017168512975, + "learning_rate": 5.241134592970512e-07, + "loss": 1.4277, + "step": 5086 + }, + { + "epoch": 0.3544577221893182, + "grad_norm": 0.7167457202174242, + "learning_rate": 5.24051587946853e-07, + "loss": 1.4938, + "step": 5087 + }, + { + "epoch": 0.3545274013169355, + "grad_norm": 0.6964352455294602, + "learning_rate": 5.239897099330175e-07, + "loss": 1.6145, + "step": 5088 + }, + { + "epoch": 0.35459708044455285, + "grad_norm": 0.6974430427662163, + "learning_rate": 5.239278252585096e-07, + "loss": 1.4977, + "step": 5089 + }, + { + "epoch": 0.35466675957217014, + "grad_norm": 0.6855351540727951, + "learning_rate": 5.23865933926295e-07, + "loss": 1.5572, + "step": 5090 + }, + { + "epoch": 0.3547364386997875, + "grad_norm": 0.7201666934289629, + "learning_rate": 5.238040359393399e-07, + "loss": 1.6433, + "step": 5091 + }, + { + "epoch": 0.3548061178274048, + "grad_norm": 0.7277989750215804, + "learning_rate": 5.237421313006103e-07, + "loss": 1.536, + "step": 5092 + }, + { + "epoch": 0.35487579695502214, + "grad_norm": 0.6634341796241707, + "learning_rate": 5.236802200130731e-07, + "loss": 1.4078, + "step": 5093 + }, + { + "epoch": 0.35494547608263943, + "grad_norm": 0.7052333618869744, + "learning_rate": 5.23618302079695e-07, + "loss": 1.4558, + "step": 5094 + }, + { + "epoch": 0.3550151552102568, + "grad_norm": 0.7597778563956493, + "learning_rate": 5.235563775034431e-07, + "loss": 1.5985, + "step": 5095 + }, + { + "epoch": 0.3550848343378741, + "grad_norm": 0.7706519972191562, + "learning_rate": 5.234944462872853e-07, + "loss": 1.4622, + "step": 5096 + }, + { + "epoch": 0.35515451346549143, + "grad_norm": 0.7272603173122851, + "learning_rate": 5.234325084341893e-07, + "loss": 1.4587, + "step": 5097 + }, + { + "epoch": 0.3552241925931087, + "grad_norm": 0.7338825767979815, + "learning_rate": 5.233705639471233e-07, + "loss": 1.7211, + "step": 5098 + }, + { + "epoch": 0.3552938717207261, + "grad_norm": 0.7069141949193741, + "learning_rate": 5.233086128290559e-07, + "loss": 1.4935, + "step": 5099 + }, + { + "epoch": 0.35536355084834337, + "grad_norm": 0.6440283358521028, + "learning_rate": 5.232466550829557e-07, + "loss": 1.391, + "step": 5100 + }, + { + "epoch": 0.3554332299759607, + "grad_norm": 0.7555141567575172, + "learning_rate": 5.231846907117919e-07, + "loss": 1.6087, + "step": 5101 + }, + { + "epoch": 0.355502909103578, + "grad_norm": 0.7162415478121203, + "learning_rate": 5.231227197185342e-07, + "loss": 1.5383, + "step": 5102 + }, + { + "epoch": 0.35557258823119536, + "grad_norm": 0.731361586378461, + "learning_rate": 5.230607421061522e-07, + "loss": 1.4528, + "step": 5103 + }, + { + "epoch": 0.35564226735881266, + "grad_norm": 0.7519791657504464, + "learning_rate": 5.22998757877616e-07, + "loss": 1.5298, + "step": 5104 + }, + { + "epoch": 0.35571194648643, + "grad_norm": 0.7158025830973831, + "learning_rate": 5.22936767035896e-07, + "loss": 1.494, + "step": 5105 + }, + { + "epoch": 0.3557816256140473, + "grad_norm": 0.723748809683638, + "learning_rate": 5.228747695839628e-07, + "loss": 1.6235, + "step": 5106 + }, + { + "epoch": 0.35585130474166465, + "grad_norm": 0.690675333061282, + "learning_rate": 5.228127655247878e-07, + "loss": 1.5151, + "step": 5107 + }, + { + "epoch": 0.35592098386928195, + "grad_norm": 0.7119021276041587, + "learning_rate": 5.227507548613421e-07, + "loss": 1.5736, + "step": 5108 + }, + { + "epoch": 0.3559906629968993, + "grad_norm": 0.7377717095815133, + "learning_rate": 5.226887375965974e-07, + "loss": 1.6437, + "step": 5109 + }, + { + "epoch": 0.3560603421245166, + "grad_norm": 0.6893478182531352, + "learning_rate": 5.226267137335256e-07, + "loss": 1.4684, + "step": 5110 + }, + { + "epoch": 0.35613002125213394, + "grad_norm": 0.7497735677098524, + "learning_rate": 5.225646832750993e-07, + "loss": 1.6747, + "step": 5111 + }, + { + "epoch": 0.35619970037975124, + "grad_norm": 0.7102877627281015, + "learning_rate": 5.225026462242909e-07, + "loss": 1.5834, + "step": 5112 + }, + { + "epoch": 0.3562693795073686, + "grad_norm": 0.7598662761974709, + "learning_rate": 5.224406025840734e-07, + "loss": 1.5745, + "step": 5113 + }, + { + "epoch": 0.3563390586349859, + "grad_norm": 0.690830365671091, + "learning_rate": 5.223785523574201e-07, + "loss": 1.5295, + "step": 5114 + }, + { + "epoch": 0.35640873776260323, + "grad_norm": 0.7349684708292723, + "learning_rate": 5.223164955473045e-07, + "loss": 1.739, + "step": 5115 + }, + { + "epoch": 0.35647841689022053, + "grad_norm": 0.7050326698210355, + "learning_rate": 5.222544321567006e-07, + "loss": 1.4603, + "step": 5116 + }, + { + "epoch": 0.3565480960178379, + "grad_norm": 0.744957758056315, + "learning_rate": 5.221923621885824e-07, + "loss": 1.6169, + "step": 5117 + }, + { + "epoch": 0.3566177751454552, + "grad_norm": 0.7146331562092086, + "learning_rate": 5.221302856459247e-07, + "loss": 1.5036, + "step": 5118 + }, + { + "epoch": 0.3566874542730725, + "grad_norm": 0.7084897847748508, + "learning_rate": 5.220682025317022e-07, + "loss": 1.5428, + "step": 5119 + }, + { + "epoch": 0.3567571334006898, + "grad_norm": 0.768911401979755, + "learning_rate": 5.220061128488898e-07, + "loss": 1.4999, + "step": 5120 + }, + { + "epoch": 0.35682681252830717, + "grad_norm": 0.7568365121723222, + "learning_rate": 5.219440166004635e-07, + "loss": 1.48, + "step": 5121 + }, + { + "epoch": 0.35689649165592446, + "grad_norm": 0.754288265747282, + "learning_rate": 5.218819137893987e-07, + "loss": 1.6032, + "step": 5122 + }, + { + "epoch": 0.3569661707835418, + "grad_norm": 0.7313637670227049, + "learning_rate": 5.218198044186714e-07, + "loss": 1.528, + "step": 5123 + }, + { + "epoch": 0.3570358499111591, + "grad_norm": 0.7476164464044086, + "learning_rate": 5.217576884912583e-07, + "loss": 1.41, + "step": 5124 + }, + { + "epoch": 0.35710552903877646, + "grad_norm": 0.7170855090682579, + "learning_rate": 5.216955660101362e-07, + "loss": 1.4268, + "step": 5125 + }, + { + "epoch": 0.35717520816639375, + "grad_norm": 0.7142896772084316, + "learning_rate": 5.216334369782816e-07, + "loss": 1.5978, + "step": 5126 + }, + { + "epoch": 0.3572448872940111, + "grad_norm": 0.7358415994670768, + "learning_rate": 5.215713013986725e-07, + "loss": 1.4269, + "step": 5127 + }, + { + "epoch": 0.3573145664216284, + "grad_norm": 0.7153570108585909, + "learning_rate": 5.215091592742861e-07, + "loss": 1.5301, + "step": 5128 + }, + { + "epoch": 0.35738424554924575, + "grad_norm": 0.7447373193332415, + "learning_rate": 5.214470106081006e-07, + "loss": 1.5988, + "step": 5129 + }, + { + "epoch": 0.35745392467686304, + "grad_norm": 0.7478977331229874, + "learning_rate": 5.213848554030942e-07, + "loss": 1.5723, + "step": 5130 + }, + { + "epoch": 0.3575236038044804, + "grad_norm": 0.6830375549123991, + "learning_rate": 5.213226936622456e-07, + "loss": 1.5305, + "step": 5131 + }, + { + "epoch": 0.3575932829320977, + "grad_norm": 0.7308156057281666, + "learning_rate": 5.212605253885334e-07, + "loss": 1.5576, + "step": 5132 + }, + { + "epoch": 0.35766296205971504, + "grad_norm": 0.6936867476883012, + "learning_rate": 5.211983505849374e-07, + "loss": 1.4978, + "step": 5133 + }, + { + "epoch": 0.35773264118733233, + "grad_norm": 0.7707040281173906, + "learning_rate": 5.211361692544366e-07, + "loss": 1.5393, + "step": 5134 + }, + { + "epoch": 0.3578023203149497, + "grad_norm": 0.7024155180503071, + "learning_rate": 5.210739814000112e-07, + "loss": 1.5802, + "step": 5135 + }, + { + "epoch": 0.357871999442567, + "grad_norm": 0.7394402935977779, + "learning_rate": 5.210117870246413e-07, + "loss": 1.5669, + "step": 5136 + }, + { + "epoch": 0.3579416785701843, + "grad_norm": 0.8847985010981289, + "learning_rate": 5.209495861313073e-07, + "loss": 1.6315, + "step": 5137 + }, + { + "epoch": 0.3580113576978016, + "grad_norm": 0.6932795155767438, + "learning_rate": 5.208873787229901e-07, + "loss": 1.5393, + "step": 5138 + }, + { + "epoch": 0.3580810368254189, + "grad_norm": 0.7381773345843451, + "learning_rate": 5.208251648026706e-07, + "loss": 1.4413, + "step": 5139 + }, + { + "epoch": 0.35815071595303627, + "grad_norm": 0.7475210673169992, + "learning_rate": 5.207629443733305e-07, + "loss": 1.5271, + "step": 5140 + }, + { + "epoch": 0.35822039508065356, + "grad_norm": 0.6821769502467249, + "learning_rate": 5.207007174379512e-07, + "loss": 1.3818, + "step": 5141 + }, + { + "epoch": 0.3582900742082709, + "grad_norm": 0.7540507964274135, + "learning_rate": 5.206384839995151e-07, + "loss": 1.6267, + "step": 5142 + }, + { + "epoch": 0.3583597533358882, + "grad_norm": 0.7189622914498229, + "learning_rate": 5.205762440610043e-07, + "loss": 1.465, + "step": 5143 + }, + { + "epoch": 0.35842943246350556, + "grad_norm": 0.7217356184482514, + "learning_rate": 5.205139976254017e-07, + "loss": 1.3787, + "step": 5144 + }, + { + "epoch": 0.35849911159112285, + "grad_norm": 0.727465508665904, + "learning_rate": 5.204517446956899e-07, + "loss": 1.5033, + "step": 5145 + }, + { + "epoch": 0.3585687907187402, + "grad_norm": 0.7359901406990895, + "learning_rate": 5.203894852748525e-07, + "loss": 1.593, + "step": 5146 + }, + { + "epoch": 0.3586384698463575, + "grad_norm": 0.7027153744924136, + "learning_rate": 5.203272193658731e-07, + "loss": 1.5325, + "step": 5147 + }, + { + "epoch": 0.35870814897397485, + "grad_norm": 0.7257847487940218, + "learning_rate": 5.202649469717355e-07, + "loss": 1.6293, + "step": 5148 + }, + { + "epoch": 0.35877782810159214, + "grad_norm": 0.7370632429557675, + "learning_rate": 5.202026680954239e-07, + "loss": 1.5385, + "step": 5149 + }, + { + "epoch": 0.3588475072292095, + "grad_norm": 0.6892655930676715, + "learning_rate": 5.201403827399229e-07, + "loss": 1.4966, + "step": 5150 + }, + { + "epoch": 0.3589171863568268, + "grad_norm": 0.6988742124124329, + "learning_rate": 5.200780909082172e-07, + "loss": 1.4691, + "step": 5151 + }, + { + "epoch": 0.35898686548444414, + "grad_norm": 0.760811010008508, + "learning_rate": 5.200157926032923e-07, + "loss": 1.6241, + "step": 5152 + }, + { + "epoch": 0.35905654461206143, + "grad_norm": 0.7212598757194405, + "learning_rate": 5.199534878281334e-07, + "loss": 1.5798, + "step": 5153 + }, + { + "epoch": 0.3591262237396788, + "grad_norm": 0.7060841907531608, + "learning_rate": 5.198911765857262e-07, + "loss": 1.5388, + "step": 5154 + }, + { + "epoch": 0.3591959028672961, + "grad_norm": 0.7837864246835402, + "learning_rate": 5.198288588790569e-07, + "loss": 1.5413, + "step": 5155 + }, + { + "epoch": 0.35926558199491343, + "grad_norm": 0.7216877555907196, + "learning_rate": 5.197665347111119e-07, + "loss": 1.5262, + "step": 5156 + }, + { + "epoch": 0.3593352611225307, + "grad_norm": 0.804075556812053, + "learning_rate": 5.19704204084878e-07, + "loss": 1.5869, + "step": 5157 + }, + { + "epoch": 0.3594049402501481, + "grad_norm": 0.7217025913661967, + "learning_rate": 5.19641867003342e-07, + "loss": 1.5733, + "step": 5158 + }, + { + "epoch": 0.35947461937776537, + "grad_norm": 0.6582452744365979, + "learning_rate": 5.195795234694912e-07, + "loss": 1.6061, + "step": 5159 + }, + { + "epoch": 0.3595442985053827, + "grad_norm": 0.7737437639055407, + "learning_rate": 5.195171734863135e-07, + "loss": 1.5443, + "step": 5160 + }, + { + "epoch": 0.359613977633, + "grad_norm": 0.7250214544500239, + "learning_rate": 5.194548170567967e-07, + "loss": 1.5808, + "step": 5161 + }, + { + "epoch": 0.35968365676061737, + "grad_norm": 0.6857607962575639, + "learning_rate": 5.19392454183929e-07, + "loss": 1.4723, + "step": 5162 + }, + { + "epoch": 0.35975333588823466, + "grad_norm": 0.6663403161373356, + "learning_rate": 5.19330084870699e-07, + "loss": 1.4604, + "step": 5163 + }, + { + "epoch": 0.359823015015852, + "grad_norm": 0.7785922329800811, + "learning_rate": 5.192677091200955e-07, + "loss": 1.5168, + "step": 5164 + }, + { + "epoch": 0.3598926941434693, + "grad_norm": 0.69477412998931, + "learning_rate": 5.192053269351078e-07, + "loss": 1.5047, + "step": 5165 + }, + { + "epoch": 0.35996237327108666, + "grad_norm": 0.7145866167341475, + "learning_rate": 5.191429383187252e-07, + "loss": 1.7196, + "step": 5166 + }, + { + "epoch": 0.36003205239870395, + "grad_norm": 0.8216013176843149, + "learning_rate": 5.190805432739378e-07, + "loss": 1.6393, + "step": 5167 + }, + { + "epoch": 0.3601017315263213, + "grad_norm": 0.7794476392517875, + "learning_rate": 5.190181418037354e-07, + "loss": 1.4782, + "step": 5168 + }, + { + "epoch": 0.3601714106539386, + "grad_norm": 0.6724797600349622, + "learning_rate": 5.189557339111084e-07, + "loss": 1.4706, + "step": 5169 + }, + { + "epoch": 0.36024108978155595, + "grad_norm": 0.7334712166799854, + "learning_rate": 5.188933195990478e-07, + "loss": 1.6242, + "step": 5170 + }, + { + "epoch": 0.36031076890917324, + "grad_norm": 0.7208270609468629, + "learning_rate": 5.188308988705443e-07, + "loss": 1.5976, + "step": 5171 + }, + { + "epoch": 0.3603804480367906, + "grad_norm": 0.8019105316309917, + "learning_rate": 5.187684717285897e-07, + "loss": 1.5433, + "step": 5172 + }, + { + "epoch": 0.3604501271644079, + "grad_norm": 0.7227317280229342, + "learning_rate": 5.187060381761749e-07, + "loss": 1.6401, + "step": 5173 + }, + { + "epoch": 0.36051980629202524, + "grad_norm": 0.7261853178782461, + "learning_rate": 5.186435982162924e-07, + "loss": 1.6069, + "step": 5174 + }, + { + "epoch": 0.36058948541964253, + "grad_norm": 0.731713234324846, + "learning_rate": 5.185811518519344e-07, + "loss": 1.4512, + "step": 5175 + }, + { + "epoch": 0.3606591645472599, + "grad_norm": 0.6683510801178882, + "learning_rate": 5.185186990860932e-07, + "loss": 1.4891, + "step": 5176 + }, + { + "epoch": 0.3607288436748772, + "grad_norm": 0.7496938877958745, + "learning_rate": 5.184562399217621e-07, + "loss": 1.4466, + "step": 5177 + }, + { + "epoch": 0.3607985228024945, + "grad_norm": 0.733168705759652, + "learning_rate": 5.183937743619337e-07, + "loss": 1.5857, + "step": 5178 + }, + { + "epoch": 0.3608682019301118, + "grad_norm": 0.7430949705009791, + "learning_rate": 5.18331302409602e-07, + "loss": 1.5315, + "step": 5179 + }, + { + "epoch": 0.36093788105772917, + "grad_norm": 0.7773572657707958, + "learning_rate": 5.182688240677605e-07, + "loss": 1.5201, + "step": 5180 + }, + { + "epoch": 0.36100756018534647, + "grad_norm": 0.7536228512940022, + "learning_rate": 5.182063393394033e-07, + "loss": 1.5811, + "step": 5181 + }, + { + "epoch": 0.3610772393129638, + "grad_norm": 0.6928947809986236, + "learning_rate": 5.181438482275249e-07, + "loss": 1.4702, + "step": 5182 + }, + { + "epoch": 0.3611469184405811, + "grad_norm": 0.7192975853115967, + "learning_rate": 5.1808135073512e-07, + "loss": 1.451, + "step": 5183 + }, + { + "epoch": 0.36121659756819846, + "grad_norm": 0.693654126443667, + "learning_rate": 5.180188468651835e-07, + "loss": 1.4676, + "step": 5184 + }, + { + "epoch": 0.36128627669581576, + "grad_norm": 0.7256757119654305, + "learning_rate": 5.179563366207108e-07, + "loss": 1.6045, + "step": 5185 + }, + { + "epoch": 0.3613559558234331, + "grad_norm": 0.6875294331327294, + "learning_rate": 5.178938200046974e-07, + "loss": 1.513, + "step": 5186 + }, + { + "epoch": 0.3614256349510504, + "grad_norm": 0.739944485913754, + "learning_rate": 5.178312970201394e-07, + "loss": 1.4669, + "step": 5187 + }, + { + "epoch": 0.36149531407866775, + "grad_norm": 0.6832797355399127, + "learning_rate": 5.17768767670033e-07, + "loss": 1.4283, + "step": 5188 + }, + { + "epoch": 0.36156499320628505, + "grad_norm": 0.6876615223406541, + "learning_rate": 5.177062319573746e-07, + "loss": 1.5305, + "step": 5189 + }, + { + "epoch": 0.3616346723339024, + "grad_norm": 0.701091374244738, + "learning_rate": 5.176436898851611e-07, + "loss": 1.4617, + "step": 5190 + }, + { + "epoch": 0.3617043514615197, + "grad_norm": 0.723571517664063, + "learning_rate": 5.175811414563897e-07, + "loss": 1.5976, + "step": 5191 + }, + { + "epoch": 0.36177403058913704, + "grad_norm": 0.7259446504081081, + "learning_rate": 5.17518586674058e-07, + "loss": 1.6222, + "step": 5192 + }, + { + "epoch": 0.36184370971675434, + "grad_norm": 0.7117002981964684, + "learning_rate": 5.174560255411634e-07, + "loss": 1.5056, + "step": 5193 + }, + { + "epoch": 0.3619133888443717, + "grad_norm": 0.7279042262192607, + "learning_rate": 5.173934580607041e-07, + "loss": 1.6258, + "step": 5194 + }, + { + "epoch": 0.361983067971989, + "grad_norm": 0.7104886630481225, + "learning_rate": 5.173308842356785e-07, + "loss": 1.5851, + "step": 5195 + }, + { + "epoch": 0.36205274709960633, + "grad_norm": 0.6871358125170827, + "learning_rate": 5.172683040690853e-07, + "loss": 1.5191, + "step": 5196 + }, + { + "epoch": 0.3621224262272236, + "grad_norm": 0.7192980784986389, + "learning_rate": 5.172057175639234e-07, + "loss": 1.6021, + "step": 5197 + }, + { + "epoch": 0.362192105354841, + "grad_norm": 0.6654693323072031, + "learning_rate": 5.171431247231921e-07, + "loss": 1.5323, + "step": 5198 + }, + { + "epoch": 0.36226178448245827, + "grad_norm": 0.7188297136953314, + "learning_rate": 5.17080525549891e-07, + "loss": 1.6149, + "step": 5199 + }, + { + "epoch": 0.3623314636100756, + "grad_norm": 0.7728183408864958, + "learning_rate": 5.170179200470199e-07, + "loss": 1.6164, + "step": 5200 + }, + { + "epoch": 0.3624011427376929, + "grad_norm": 0.7382453104563416, + "learning_rate": 5.169553082175792e-07, + "loss": 1.5468, + "step": 5201 + }, + { + "epoch": 0.36247082186531027, + "grad_norm": 0.7664313372593619, + "learning_rate": 5.16892690064569e-07, + "loss": 1.5538, + "step": 5202 + }, + { + "epoch": 0.36254050099292756, + "grad_norm": 0.7403909821642547, + "learning_rate": 5.168300655909905e-07, + "loss": 1.5414, + "step": 5203 + }, + { + "epoch": 0.3626101801205449, + "grad_norm": 0.7610636702682715, + "learning_rate": 5.167674347998446e-07, + "loss": 1.6746, + "step": 5204 + }, + { + "epoch": 0.3626798592481622, + "grad_norm": 0.7008708794040567, + "learning_rate": 5.167047976941327e-07, + "loss": 1.4871, + "step": 5205 + }, + { + "epoch": 0.36274953837577956, + "grad_norm": 0.6827672523246823, + "learning_rate": 5.166421542768564e-07, + "loss": 1.5816, + "step": 5206 + }, + { + "epoch": 0.36281921750339685, + "grad_norm": 0.7860676509331375, + "learning_rate": 5.165795045510179e-07, + "loss": 1.5773, + "step": 5207 + }, + { + "epoch": 0.3628888966310142, + "grad_norm": 0.7178660737374489, + "learning_rate": 5.165168485196194e-07, + "loss": 1.466, + "step": 5208 + }, + { + "epoch": 0.3629585757586315, + "grad_norm": 0.7093634056240848, + "learning_rate": 5.164541861856636e-07, + "loss": 1.3871, + "step": 5209 + }, + { + "epoch": 0.36302825488624885, + "grad_norm": 0.6972079077503902, + "learning_rate": 5.163915175521532e-07, + "loss": 1.449, + "step": 5210 + }, + { + "epoch": 0.36309793401386614, + "grad_norm": 0.747745907435438, + "learning_rate": 5.163288426220918e-07, + "loss": 1.5072, + "step": 5211 + }, + { + "epoch": 0.3631676131414835, + "grad_norm": 0.733427906317132, + "learning_rate": 5.162661613984823e-07, + "loss": 1.5162, + "step": 5212 + }, + { + "epoch": 0.3632372922691008, + "grad_norm": 0.7233838960156477, + "learning_rate": 5.162034738843291e-07, + "loss": 1.66, + "step": 5213 + }, + { + "epoch": 0.36330697139671814, + "grad_norm": 0.7357691997602305, + "learning_rate": 5.161407800826359e-07, + "loss": 1.6048, + "step": 5214 + }, + { + "epoch": 0.36337665052433543, + "grad_norm": 0.7419119044198975, + "learning_rate": 5.160780799964074e-07, + "loss": 1.5819, + "step": 5215 + }, + { + "epoch": 0.3634463296519528, + "grad_norm": 0.7333257219084258, + "learning_rate": 5.160153736286481e-07, + "loss": 1.5136, + "step": 5216 + }, + { + "epoch": 0.3635160087795701, + "grad_norm": 0.7561954088937832, + "learning_rate": 5.159526609823633e-07, + "loss": 1.532, + "step": 5217 + }, + { + "epoch": 0.3635856879071874, + "grad_norm": 0.7266071236575034, + "learning_rate": 5.15889942060558e-07, + "loss": 1.6245, + "step": 5218 + }, + { + "epoch": 0.3636553670348047, + "grad_norm": 0.7365917391566773, + "learning_rate": 5.15827216866238e-07, + "loss": 1.6322, + "step": 5219 + }, + { + "epoch": 0.36372504616242207, + "grad_norm": 0.7040763190740109, + "learning_rate": 5.157644854024093e-07, + "loss": 1.49, + "step": 5220 + }, + { + "epoch": 0.36379472529003937, + "grad_norm": 0.7142262462047746, + "learning_rate": 5.15701747672078e-07, + "loss": 1.5597, + "step": 5221 + }, + { + "epoch": 0.3638644044176567, + "grad_norm": 0.6858746531213153, + "learning_rate": 5.156390036782504e-07, + "loss": 1.5639, + "step": 5222 + }, + { + "epoch": 0.363934083545274, + "grad_norm": 0.6759937525398312, + "learning_rate": 5.155762534239337e-07, + "loss": 1.5822, + "step": 5223 + }, + { + "epoch": 0.36400376267289136, + "grad_norm": 0.7509133714833661, + "learning_rate": 5.155134969121349e-07, + "loss": 1.5394, + "step": 5224 + }, + { + "epoch": 0.36407344180050866, + "grad_norm": 0.7170835709751526, + "learning_rate": 5.154507341458613e-07, + "loss": 1.5826, + "step": 5225 + }, + { + "epoch": 0.364143120928126, + "grad_norm": 0.7065609504484022, + "learning_rate": 5.153879651281208e-07, + "loss": 1.5208, + "step": 5226 + }, + { + "epoch": 0.3642128000557433, + "grad_norm": 0.6860528121460813, + "learning_rate": 5.153251898619212e-07, + "loss": 1.5345, + "step": 5227 + }, + { + "epoch": 0.3642824791833606, + "grad_norm": 0.8400742404674034, + "learning_rate": 5.152624083502711e-07, + "loss": 1.4601, + "step": 5228 + }, + { + "epoch": 0.36435215831097795, + "grad_norm": 0.7068731002179544, + "learning_rate": 5.151996205961789e-07, + "loss": 1.5634, + "step": 5229 + }, + { + "epoch": 0.36442183743859524, + "grad_norm": 0.682862701091438, + "learning_rate": 5.151368266026535e-07, + "loss": 1.5034, + "step": 5230 + }, + { + "epoch": 0.3644915165662126, + "grad_norm": 0.740431488301115, + "learning_rate": 5.150740263727043e-07, + "loss": 1.6257, + "step": 5231 + }, + { + "epoch": 0.3645611956938299, + "grad_norm": 0.6693329964335967, + "learning_rate": 5.150112199093407e-07, + "loss": 1.4643, + "step": 5232 + }, + { + "epoch": 0.36463087482144724, + "grad_norm": 0.7470365567305248, + "learning_rate": 5.149484072155725e-07, + "loss": 1.5566, + "step": 5233 + }, + { + "epoch": 0.36470055394906453, + "grad_norm": 0.7473497564826113, + "learning_rate": 5.1488558829441e-07, + "loss": 1.4871, + "step": 5234 + }, + { + "epoch": 0.3647702330766819, + "grad_norm": 0.6911032507873867, + "learning_rate": 5.148227631488632e-07, + "loss": 1.4324, + "step": 5235 + }, + { + "epoch": 0.3648399122042992, + "grad_norm": 0.6816650784423868, + "learning_rate": 5.147599317819434e-07, + "loss": 1.5158, + "step": 5236 + }, + { + "epoch": 0.3649095913319165, + "grad_norm": 0.7410861631268022, + "learning_rate": 5.14697094196661e-07, + "loss": 1.5026, + "step": 5237 + }, + { + "epoch": 0.3649792704595338, + "grad_norm": 0.6936484383081856, + "learning_rate": 5.146342503960276e-07, + "loss": 1.5705, + "step": 5238 + }, + { + "epoch": 0.3650489495871512, + "grad_norm": 0.6786652114843855, + "learning_rate": 5.14571400383055e-07, + "loss": 1.5508, + "step": 5239 + }, + { + "epoch": 0.36511862871476847, + "grad_norm": 0.7273301299834638, + "learning_rate": 5.145085441607548e-07, + "loss": 1.4541, + "step": 5240 + }, + { + "epoch": 0.3651883078423858, + "grad_norm": 0.7934277087687172, + "learning_rate": 5.144456817321391e-07, + "loss": 1.5811, + "step": 5241 + }, + { + "epoch": 0.3652579869700031, + "grad_norm": 0.7129202993259002, + "learning_rate": 5.143828131002207e-07, + "loss": 1.6266, + "step": 5242 + }, + { + "epoch": 0.36532766609762046, + "grad_norm": 0.662891493603992, + "learning_rate": 5.143199382680122e-07, + "loss": 1.4635, + "step": 5243 + }, + { + "epoch": 0.36539734522523776, + "grad_norm": 0.7154475325660717, + "learning_rate": 5.14257057238527e-07, + "loss": 1.5247, + "step": 5244 + }, + { + "epoch": 0.3654670243528551, + "grad_norm": 0.7066260380609907, + "learning_rate": 5.141941700147782e-07, + "loss": 1.5311, + "step": 5245 + }, + { + "epoch": 0.3655367034804724, + "grad_norm": 0.6989214833213997, + "learning_rate": 5.141312765997793e-07, + "loss": 1.2925, + "step": 5246 + }, + { + "epoch": 0.36560638260808975, + "grad_norm": 0.7044441524364802, + "learning_rate": 5.140683769965448e-07, + "loss": 1.61, + "step": 5247 + }, + { + "epoch": 0.36567606173570705, + "grad_norm": 0.6607704920750958, + "learning_rate": 5.140054712080886e-07, + "loss": 1.444, + "step": 5248 + }, + { + "epoch": 0.3657457408633244, + "grad_norm": 0.7357333603325759, + "learning_rate": 5.139425592374253e-07, + "loss": 1.5409, + "step": 5249 + }, + { + "epoch": 0.3658154199909417, + "grad_norm": 0.6514266738332384, + "learning_rate": 5.138796410875699e-07, + "loss": 1.4404, + "step": 5250 + }, + { + "epoch": 0.36588509911855904, + "grad_norm": 0.6992053094386647, + "learning_rate": 5.138167167615376e-07, + "loss": 1.5441, + "step": 5251 + }, + { + "epoch": 0.36595477824617634, + "grad_norm": 0.7186567293557066, + "learning_rate": 5.137537862623437e-07, + "loss": 1.5629, + "step": 5252 + }, + { + "epoch": 0.3660244573737937, + "grad_norm": 0.7064096116081595, + "learning_rate": 5.136908495930041e-07, + "loss": 1.353, + "step": 5253 + }, + { + "epoch": 0.366094136501411, + "grad_norm": 0.694342282292615, + "learning_rate": 5.136279067565347e-07, + "loss": 1.6093, + "step": 5254 + }, + { + "epoch": 0.36616381562902833, + "grad_norm": 0.695106825117824, + "learning_rate": 5.135649577559519e-07, + "loss": 1.53, + "step": 5255 + }, + { + "epoch": 0.36623349475664563, + "grad_norm": 0.6721407293211693, + "learning_rate": 5.135020025942726e-07, + "loss": 1.5514, + "step": 5256 + }, + { + "epoch": 0.366303173884263, + "grad_norm": 0.7433143801168884, + "learning_rate": 5.134390412745134e-07, + "loss": 1.6664, + "step": 5257 + }, + { + "epoch": 0.3663728530118803, + "grad_norm": 0.7759049168554641, + "learning_rate": 5.133760737996916e-07, + "loss": 1.6917, + "step": 5258 + }, + { + "epoch": 0.3664425321394976, + "grad_norm": 0.7287911458113379, + "learning_rate": 5.133131001728248e-07, + "loss": 1.6547, + "step": 5259 + }, + { + "epoch": 0.3665122112671149, + "grad_norm": 0.7351234554888396, + "learning_rate": 5.132501203969309e-07, + "loss": 1.5529, + "step": 5260 + }, + { + "epoch": 0.36658189039473227, + "grad_norm": 0.7453393320312598, + "learning_rate": 5.131871344750279e-07, + "loss": 1.5328, + "step": 5261 + }, + { + "epoch": 0.36665156952234956, + "grad_norm": 0.6918766579616535, + "learning_rate": 5.131241424101342e-07, + "loss": 1.5461, + "step": 5262 + }, + { + "epoch": 0.3667212486499669, + "grad_norm": 0.7194318357060998, + "learning_rate": 5.130611442052686e-07, + "loss": 1.6265, + "step": 5263 + }, + { + "epoch": 0.3667909277775842, + "grad_norm": 0.7300076987686653, + "learning_rate": 5.129981398634502e-07, + "loss": 1.5558, + "step": 5264 + }, + { + "epoch": 0.36686060690520156, + "grad_norm": 0.7009279205137522, + "learning_rate": 5.129351293876979e-07, + "loss": 1.4367, + "step": 5265 + }, + { + "epoch": 0.36693028603281885, + "grad_norm": 0.8297625632738989, + "learning_rate": 5.128721127810318e-07, + "loss": 1.7773, + "step": 5266 + }, + { + "epoch": 0.3669999651604362, + "grad_norm": 0.6939962844924814, + "learning_rate": 5.128090900464713e-07, + "loss": 1.5233, + "step": 5267 + }, + { + "epoch": 0.3670696442880535, + "grad_norm": 0.7366202778643759, + "learning_rate": 5.12746061187037e-07, + "loss": 1.6129, + "step": 5268 + }, + { + "epoch": 0.36713932341567085, + "grad_norm": 0.7558579968863058, + "learning_rate": 5.126830262057491e-07, + "loss": 1.5278, + "step": 5269 + }, + { + "epoch": 0.36720900254328814, + "grad_norm": 0.718857236429046, + "learning_rate": 5.126199851056285e-07, + "loss": 1.4735, + "step": 5270 + }, + { + "epoch": 0.3672786816709055, + "grad_norm": 0.7972347547964148, + "learning_rate": 5.125569378896962e-07, + "loss": 1.6827, + "step": 5271 + }, + { + "epoch": 0.3673483607985228, + "grad_norm": 0.6899496063426461, + "learning_rate": 5.124938845609736e-07, + "loss": 1.58, + "step": 5272 + }, + { + "epoch": 0.36741803992614014, + "grad_norm": 0.7818174794056979, + "learning_rate": 5.124308251224824e-07, + "loss": 1.5645, + "step": 5273 + }, + { + "epoch": 0.36748771905375743, + "grad_norm": 0.7124659133872496, + "learning_rate": 5.123677595772443e-07, + "loss": 1.5349, + "step": 5274 + }, + { + "epoch": 0.3675573981813748, + "grad_norm": 0.7511031303630104, + "learning_rate": 5.123046879282817e-07, + "loss": 1.5803, + "step": 5275 + }, + { + "epoch": 0.3676270773089921, + "grad_norm": 0.6851223267795419, + "learning_rate": 5.122416101786171e-07, + "loss": 1.5072, + "step": 5276 + }, + { + "epoch": 0.36769675643660943, + "grad_norm": 0.806850869793742, + "learning_rate": 5.121785263312734e-07, + "loss": 1.6434, + "step": 5277 + }, + { + "epoch": 0.3677664355642267, + "grad_norm": 0.7254644946888305, + "learning_rate": 5.121154363892735e-07, + "loss": 1.5579, + "step": 5278 + }, + { + "epoch": 0.3678361146918441, + "grad_norm": 0.7252493972658083, + "learning_rate": 5.12052340355641e-07, + "loss": 1.5835, + "step": 5279 + }, + { + "epoch": 0.36790579381946137, + "grad_norm": 0.7662299719465424, + "learning_rate": 5.119892382333996e-07, + "loss": 1.5148, + "step": 5280 + }, + { + "epoch": 0.3679754729470787, + "grad_norm": 0.7165871659616716, + "learning_rate": 5.119261300255731e-07, + "loss": 1.5447, + "step": 5281 + }, + { + "epoch": 0.368045152074696, + "grad_norm": 0.6639950924743402, + "learning_rate": 5.118630157351859e-07, + "loss": 1.554, + "step": 5282 + }, + { + "epoch": 0.36811483120231336, + "grad_norm": 0.7302210295216659, + "learning_rate": 5.117998953652625e-07, + "loss": 1.5355, + "step": 5283 + }, + { + "epoch": 0.36818451032993066, + "grad_norm": 0.7491105839545834, + "learning_rate": 5.117367689188279e-07, + "loss": 1.6475, + "step": 5284 + }, + { + "epoch": 0.368254189457548, + "grad_norm": 0.7506643071993163, + "learning_rate": 5.11673636398907e-07, + "loss": 1.6575, + "step": 5285 + }, + { + "epoch": 0.3683238685851653, + "grad_norm": 0.6812138419849934, + "learning_rate": 5.116104978085254e-07, + "loss": 1.4553, + "step": 5286 + }, + { + "epoch": 0.36839354771278265, + "grad_norm": 0.7854900925563119, + "learning_rate": 5.11547353150709e-07, + "loss": 1.5519, + "step": 5287 + }, + { + "epoch": 0.36846322684039995, + "grad_norm": 0.7203783971396994, + "learning_rate": 5.114842024284834e-07, + "loss": 1.6115, + "step": 5288 + }, + { + "epoch": 0.3685329059680173, + "grad_norm": 0.7243845386114148, + "learning_rate": 5.114210456448753e-07, + "loss": 1.5813, + "step": 5289 + }, + { + "epoch": 0.3686025850956346, + "grad_norm": 0.713318574120179, + "learning_rate": 5.113578828029111e-07, + "loss": 1.5824, + "step": 5290 + }, + { + "epoch": 0.36867226422325194, + "grad_norm": 0.7536484451835456, + "learning_rate": 5.112947139056177e-07, + "loss": 1.5102, + "step": 5291 + }, + { + "epoch": 0.36874194335086924, + "grad_norm": 0.7081609936285251, + "learning_rate": 5.112315389560226e-07, + "loss": 1.4296, + "step": 5292 + }, + { + "epoch": 0.3688116224784866, + "grad_norm": 0.6799279862368306, + "learning_rate": 5.111683579571528e-07, + "loss": 1.5083, + "step": 5293 + }, + { + "epoch": 0.3688813016061039, + "grad_norm": 0.890442651971021, + "learning_rate": 5.111051709120361e-07, + "loss": 1.6717, + "step": 5294 + }, + { + "epoch": 0.36895098073372123, + "grad_norm": 0.6787398243474793, + "learning_rate": 5.11041977823701e-07, + "loss": 1.5357, + "step": 5295 + }, + { + "epoch": 0.36902065986133853, + "grad_norm": 0.6934921564234446, + "learning_rate": 5.109787786951755e-07, + "loss": 1.5096, + "step": 5296 + }, + { + "epoch": 0.3690903389889559, + "grad_norm": 0.6968589312259752, + "learning_rate": 5.109155735294882e-07, + "loss": 1.4032, + "step": 5297 + }, + { + "epoch": 0.3691600181165732, + "grad_norm": 0.819070605787555, + "learning_rate": 5.108523623296682e-07, + "loss": 1.5379, + "step": 5298 + }, + { + "epoch": 0.3692296972441905, + "grad_norm": 0.7192037379184335, + "learning_rate": 5.107891450987445e-07, + "loss": 1.5684, + "step": 5299 + }, + { + "epoch": 0.3692993763718078, + "grad_norm": 0.7157852409802509, + "learning_rate": 5.107259218397469e-07, + "loss": 1.5831, + "step": 5300 + }, + { + "epoch": 0.36936905549942517, + "grad_norm": 0.7492510878850005, + "learning_rate": 5.10662692555705e-07, + "loss": 1.5437, + "step": 5301 + }, + { + "epoch": 0.36943873462704246, + "grad_norm": 0.7286317846247485, + "learning_rate": 5.105994572496488e-07, + "loss": 1.5993, + "step": 5302 + }, + { + "epoch": 0.3695084137546598, + "grad_norm": 0.7648274962276388, + "learning_rate": 5.105362159246089e-07, + "loss": 1.6396, + "step": 5303 + }, + { + "epoch": 0.3695780928822771, + "grad_norm": 0.7420079599949096, + "learning_rate": 5.104729685836156e-07, + "loss": 1.6214, + "step": 5304 + }, + { + "epoch": 0.36964777200989446, + "grad_norm": 0.692661841317161, + "learning_rate": 5.104097152297002e-07, + "loss": 1.502, + "step": 5305 + }, + { + "epoch": 0.36971745113751175, + "grad_norm": 0.6671121171209912, + "learning_rate": 5.103464558658937e-07, + "loss": 1.484, + "step": 5306 + }, + { + "epoch": 0.3697871302651291, + "grad_norm": 0.698266599735423, + "learning_rate": 5.102831904952278e-07, + "loss": 1.5238, + "step": 5307 + }, + { + "epoch": 0.3698568093927464, + "grad_norm": 0.721316187644341, + "learning_rate": 5.102199191207341e-07, + "loss": 1.5747, + "step": 5308 + }, + { + "epoch": 0.36992648852036375, + "grad_norm": 0.7555051596431045, + "learning_rate": 5.10156641745445e-07, + "loss": 1.4998, + "step": 5309 + }, + { + "epoch": 0.36999616764798104, + "grad_norm": 0.7378153963986179, + "learning_rate": 5.100933583723924e-07, + "loss": 1.5894, + "step": 5310 + }, + { + "epoch": 0.3700658467755984, + "grad_norm": 0.7299552421467299, + "learning_rate": 5.100300690046095e-07, + "loss": 1.4737, + "step": 5311 + }, + { + "epoch": 0.3701355259032157, + "grad_norm": 0.7222764229223232, + "learning_rate": 5.099667736451289e-07, + "loss": 1.534, + "step": 5312 + }, + { + "epoch": 0.37020520503083304, + "grad_norm": 0.6816801189726973, + "learning_rate": 5.099034722969839e-07, + "loss": 1.473, + "step": 5313 + }, + { + "epoch": 0.37027488415845033, + "grad_norm": 0.6548112913225141, + "learning_rate": 5.098401649632083e-07, + "loss": 1.4501, + "step": 5314 + }, + { + "epoch": 0.3703445632860677, + "grad_norm": 0.6909094462608453, + "learning_rate": 5.097768516468354e-07, + "loss": 1.6325, + "step": 5315 + }, + { + "epoch": 0.370414242413685, + "grad_norm": 0.745114108887267, + "learning_rate": 5.097135323508998e-07, + "loss": 1.4192, + "step": 5316 + }, + { + "epoch": 0.37048392154130233, + "grad_norm": 0.7094163660025467, + "learning_rate": 5.096502070784358e-07, + "loss": 1.5063, + "step": 5317 + }, + { + "epoch": 0.3705536006689196, + "grad_norm": 0.6850285597934616, + "learning_rate": 5.095868758324777e-07, + "loss": 1.5345, + "step": 5318 + }, + { + "epoch": 0.3706232797965369, + "grad_norm": 0.689281312430016, + "learning_rate": 5.095235386160609e-07, + "loss": 1.4471, + "step": 5319 + }, + { + "epoch": 0.37069295892415427, + "grad_norm": 0.6915170853407353, + "learning_rate": 5.094601954322205e-07, + "loss": 1.5534, + "step": 5320 + }, + { + "epoch": 0.37076263805177156, + "grad_norm": 0.7047793583439973, + "learning_rate": 5.09396846283992e-07, + "loss": 1.4784, + "step": 5321 + }, + { + "epoch": 0.3708323171793889, + "grad_norm": 0.6663159334094043, + "learning_rate": 5.093334911744112e-07, + "loss": 1.4606, + "step": 5322 + }, + { + "epoch": 0.3709019963070062, + "grad_norm": 0.7150983337368044, + "learning_rate": 5.092701301065143e-07, + "loss": 1.5738, + "step": 5323 + }, + { + "epoch": 0.37097167543462356, + "grad_norm": 0.7746113045884914, + "learning_rate": 5.092067630833376e-07, + "loss": 1.5458, + "step": 5324 + }, + { + "epoch": 0.37104135456224085, + "grad_norm": 0.726557554264369, + "learning_rate": 5.091433901079178e-07, + "loss": 1.5695, + "step": 5325 + }, + { + "epoch": 0.3711110336898582, + "grad_norm": 0.7372150181593171, + "learning_rate": 5.09080011183292e-07, + "loss": 1.5355, + "step": 5326 + }, + { + "epoch": 0.3711807128174755, + "grad_norm": 0.6940705110176282, + "learning_rate": 5.090166263124972e-07, + "loss": 1.6131, + "step": 5327 + }, + { + "epoch": 0.37125039194509285, + "grad_norm": 0.6706521663019233, + "learning_rate": 5.089532354985712e-07, + "loss": 1.4164, + "step": 5328 + }, + { + "epoch": 0.37132007107271015, + "grad_norm": 0.7522207421573274, + "learning_rate": 5.088898387445516e-07, + "loss": 1.6635, + "step": 5329 + }, + { + "epoch": 0.3713897502003275, + "grad_norm": 0.7628608894232416, + "learning_rate": 5.088264360534766e-07, + "loss": 1.4664, + "step": 5330 + }, + { + "epoch": 0.3714594293279448, + "grad_norm": 0.7187722963470405, + "learning_rate": 5.087630274283846e-07, + "loss": 1.4531, + "step": 5331 + }, + { + "epoch": 0.37152910845556214, + "grad_norm": 0.6801481781174705, + "learning_rate": 5.086996128723142e-07, + "loss": 1.4878, + "step": 5332 + }, + { + "epoch": 0.37159878758317944, + "grad_norm": 0.7645501547843065, + "learning_rate": 5.086361923883045e-07, + "loss": 1.4992, + "step": 5333 + }, + { + "epoch": 0.3716684667107968, + "grad_norm": 0.711836763750871, + "learning_rate": 5.085727659793944e-07, + "loss": 1.4532, + "step": 5334 + }, + { + "epoch": 0.3717381458384141, + "grad_norm": 0.712627458827013, + "learning_rate": 5.085093336486239e-07, + "loss": 1.5743, + "step": 5335 + }, + { + "epoch": 0.37180782496603143, + "grad_norm": 0.6921801368603905, + "learning_rate": 5.084458953990325e-07, + "loss": 1.5584, + "step": 5336 + }, + { + "epoch": 0.3718775040936487, + "grad_norm": 0.7227212206111542, + "learning_rate": 5.083824512336604e-07, + "loss": 1.4999, + "step": 5337 + }, + { + "epoch": 0.3719471832212661, + "grad_norm": 0.7828431028231742, + "learning_rate": 5.083190011555478e-07, + "loss": 1.6068, + "step": 5338 + }, + { + "epoch": 0.37201686234888337, + "grad_norm": 0.6628573302677304, + "learning_rate": 5.082555451677356e-07, + "loss": 1.5878, + "step": 5339 + }, + { + "epoch": 0.3720865414765007, + "grad_norm": 0.6802179570076206, + "learning_rate": 5.081920832732647e-07, + "loss": 1.4912, + "step": 5340 + }, + { + "epoch": 0.372156220604118, + "grad_norm": 0.728418405218629, + "learning_rate": 5.081286154751763e-07, + "loss": 1.5246, + "step": 5341 + }, + { + "epoch": 0.37222589973173537, + "grad_norm": 0.697994481641133, + "learning_rate": 5.080651417765117e-07, + "loss": 1.4452, + "step": 5342 + }, + { + "epoch": 0.37229557885935266, + "grad_norm": 0.7055931388578118, + "learning_rate": 5.080016621803128e-07, + "loss": 1.4946, + "step": 5343 + }, + { + "epoch": 0.37236525798697, + "grad_norm": 0.7065869550826593, + "learning_rate": 5.079381766896219e-07, + "loss": 1.6107, + "step": 5344 + }, + { + "epoch": 0.3724349371145873, + "grad_norm": 0.7410511984098631, + "learning_rate": 5.078746853074811e-07, + "loss": 1.5031, + "step": 5345 + }, + { + "epoch": 0.37250461624220466, + "grad_norm": 0.7901975002588718, + "learning_rate": 5.078111880369331e-07, + "loss": 1.7529, + "step": 5346 + }, + { + "epoch": 0.37257429536982195, + "grad_norm": 0.6930274846449783, + "learning_rate": 5.07747684881021e-07, + "loss": 1.649, + "step": 5347 + }, + { + "epoch": 0.3726439744974393, + "grad_norm": 0.8041954663566083, + "learning_rate": 5.076841758427877e-07, + "loss": 1.6462, + "step": 5348 + }, + { + "epoch": 0.3727136536250566, + "grad_norm": 0.7305914510607037, + "learning_rate": 5.07620660925277e-07, + "loss": 1.4538, + "step": 5349 + }, + { + "epoch": 0.37278333275267395, + "grad_norm": 0.7158559747595667, + "learning_rate": 5.075571401315321e-07, + "loss": 1.4756, + "step": 5350 + }, + { + "epoch": 0.37285301188029124, + "grad_norm": 0.7073775288961177, + "learning_rate": 5.074936134645978e-07, + "loss": 1.5127, + "step": 5351 + }, + { + "epoch": 0.3729226910079086, + "grad_norm": 0.7430699639842753, + "learning_rate": 5.074300809275181e-07, + "loss": 1.5496, + "step": 5352 + }, + { + "epoch": 0.3729923701355259, + "grad_norm": 0.7641833104167214, + "learning_rate": 5.073665425233374e-07, + "loss": 1.4541, + "step": 5353 + }, + { + "epoch": 0.37306204926314324, + "grad_norm": 0.7180671749293043, + "learning_rate": 5.073029982551009e-07, + "loss": 1.5463, + "step": 5354 + }, + { + "epoch": 0.37313172839076053, + "grad_norm": 0.6599128583126527, + "learning_rate": 5.072394481258535e-07, + "loss": 1.3976, + "step": 5355 + }, + { + "epoch": 0.3732014075183779, + "grad_norm": 0.7688131761567236, + "learning_rate": 5.071758921386409e-07, + "loss": 1.5714, + "step": 5356 + }, + { + "epoch": 0.3732710866459952, + "grad_norm": 0.7072828006472733, + "learning_rate": 5.071123302965086e-07, + "loss": 1.5208, + "step": 5357 + }, + { + "epoch": 0.3733407657736125, + "grad_norm": 0.7187426792443632, + "learning_rate": 5.070487626025027e-07, + "loss": 1.6456, + "step": 5358 + }, + { + "epoch": 0.3734104449012298, + "grad_norm": 0.7180845966367558, + "learning_rate": 5.069851890596698e-07, + "loss": 1.5107, + "step": 5359 + }, + { + "epoch": 0.37348012402884717, + "grad_norm": 0.6708545748911989, + "learning_rate": 5.06921609671056e-07, + "loss": 1.4611, + "step": 5360 + }, + { + "epoch": 0.37354980315646447, + "grad_norm": 0.7028485609419693, + "learning_rate": 5.068580244397085e-07, + "loss": 1.4579, + "step": 5361 + }, + { + "epoch": 0.3736194822840818, + "grad_norm": 0.6713681496765652, + "learning_rate": 5.067944333686743e-07, + "loss": 1.4134, + "step": 5362 + }, + { + "epoch": 0.3736891614116991, + "grad_norm": 0.6680925713652732, + "learning_rate": 5.067308364610006e-07, + "loss": 1.5172, + "step": 5363 + }, + { + "epoch": 0.37375884053931646, + "grad_norm": 0.7842330472219948, + "learning_rate": 5.066672337197354e-07, + "loss": 1.5071, + "step": 5364 + }, + { + "epoch": 0.37382851966693376, + "grad_norm": 0.7628791637111324, + "learning_rate": 5.066036251479266e-07, + "loss": 1.5212, + "step": 5365 + }, + { + "epoch": 0.3738981987945511, + "grad_norm": 0.6813664665615757, + "learning_rate": 5.065400107486226e-07, + "loss": 1.5094, + "step": 5366 + }, + { + "epoch": 0.3739678779221684, + "grad_norm": 0.7181594596254843, + "learning_rate": 5.064763905248716e-07, + "loss": 1.4813, + "step": 5367 + }, + { + "epoch": 0.37403755704978575, + "grad_norm": 0.7657310789918401, + "learning_rate": 5.064127644797227e-07, + "loss": 1.6721, + "step": 5368 + }, + { + "epoch": 0.37410723617740305, + "grad_norm": 0.7271926630736468, + "learning_rate": 5.063491326162248e-07, + "loss": 1.5984, + "step": 5369 + }, + { + "epoch": 0.3741769153050204, + "grad_norm": 0.7156351904619168, + "learning_rate": 5.062854949374273e-07, + "loss": 1.6321, + "step": 5370 + }, + { + "epoch": 0.3742465944326377, + "grad_norm": 0.6973806824549705, + "learning_rate": 5.062218514463799e-07, + "loss": 1.5303, + "step": 5371 + }, + { + "epoch": 0.37431627356025504, + "grad_norm": 0.7268627075391219, + "learning_rate": 5.061582021461325e-07, + "loss": 1.6009, + "step": 5372 + }, + { + "epoch": 0.37438595268787234, + "grad_norm": 0.7435202270708391, + "learning_rate": 5.060945470397354e-07, + "loss": 1.5263, + "step": 5373 + }, + { + "epoch": 0.3744556318154897, + "grad_norm": 0.7061226191508438, + "learning_rate": 5.060308861302389e-07, + "loss": 1.5312, + "step": 5374 + }, + { + "epoch": 0.374525310943107, + "grad_norm": 0.6935908957909679, + "learning_rate": 5.059672194206938e-07, + "loss": 1.4977, + "step": 5375 + }, + { + "epoch": 0.37459499007072433, + "grad_norm": 0.8044608362031292, + "learning_rate": 5.059035469141515e-07, + "loss": 1.5661, + "step": 5376 + }, + { + "epoch": 0.3746646691983416, + "grad_norm": 0.6649746679278241, + "learning_rate": 5.058398686136628e-07, + "loss": 1.5039, + "step": 5377 + }, + { + "epoch": 0.374734348325959, + "grad_norm": 0.7565675239565676, + "learning_rate": 5.057761845222795e-07, + "loss": 1.5466, + "step": 5378 + }, + { + "epoch": 0.37480402745357627, + "grad_norm": 0.6988450148686387, + "learning_rate": 5.057124946430535e-07, + "loss": 1.524, + "step": 5379 + }, + { + "epoch": 0.3748737065811936, + "grad_norm": 0.6789677269566424, + "learning_rate": 5.056487989790371e-07, + "loss": 1.5258, + "step": 5380 + }, + { + "epoch": 0.3749433857088109, + "grad_norm": 0.7113732310328157, + "learning_rate": 5.055850975332823e-07, + "loss": 1.5414, + "step": 5381 + }, + { + "epoch": 0.37501306483642827, + "grad_norm": 0.6976581810589285, + "learning_rate": 5.055213903088424e-07, + "loss": 1.5575, + "step": 5382 + }, + { + "epoch": 0.37508274396404556, + "grad_norm": 0.6854275337810213, + "learning_rate": 5.054576773087697e-07, + "loss": 1.5032, + "step": 5383 + }, + { + "epoch": 0.3751524230916629, + "grad_norm": 0.7527863161147542, + "learning_rate": 5.05393958536118e-07, + "loss": 1.5894, + "step": 5384 + }, + { + "epoch": 0.3752221022192802, + "grad_norm": 0.6747886537395279, + "learning_rate": 5.053302339939407e-07, + "loss": 1.4749, + "step": 5385 + }, + { + "epoch": 0.37529178134689756, + "grad_norm": 0.6837579373025371, + "learning_rate": 5.052665036852914e-07, + "loss": 1.5364, + "step": 5386 + }, + { + "epoch": 0.37536146047451485, + "grad_norm": 0.7873639942673621, + "learning_rate": 5.052027676132245e-07, + "loss": 1.7117, + "step": 5387 + }, + { + "epoch": 0.3754311396021322, + "grad_norm": 0.7919474795693494, + "learning_rate": 5.051390257807941e-07, + "loss": 1.6618, + "step": 5388 + }, + { + "epoch": 0.3755008187297495, + "grad_norm": 0.7103425061197322, + "learning_rate": 5.05075278191055e-07, + "loss": 1.6743, + "step": 5389 + }, + { + "epoch": 0.37557049785736685, + "grad_norm": 0.7161819553104973, + "learning_rate": 5.05011524847062e-07, + "loss": 1.566, + "step": 5390 + }, + { + "epoch": 0.37564017698498414, + "grad_norm": 0.7334385643113612, + "learning_rate": 5.049477657518704e-07, + "loss": 1.5287, + "step": 5391 + }, + { + "epoch": 0.3757098561126015, + "grad_norm": 0.7572351651031275, + "learning_rate": 5.048840009085356e-07, + "loss": 1.5927, + "step": 5392 + }, + { + "epoch": 0.3757795352402188, + "grad_norm": 0.7124113602854409, + "learning_rate": 5.048202303201134e-07, + "loss": 1.5382, + "step": 5393 + }, + { + "epoch": 0.37584921436783614, + "grad_norm": 0.6855279268109093, + "learning_rate": 5.047564539896597e-07, + "loss": 1.489, + "step": 5394 + }, + { + "epoch": 0.37591889349545343, + "grad_norm": 0.7122062077638511, + "learning_rate": 5.046926719202309e-07, + "loss": 1.4946, + "step": 5395 + }, + { + "epoch": 0.3759885726230708, + "grad_norm": 0.7091079314088186, + "learning_rate": 5.046288841148835e-07, + "loss": 1.6135, + "step": 5396 + }, + { + "epoch": 0.3760582517506881, + "grad_norm": 0.6740079747894211, + "learning_rate": 5.045650905766745e-07, + "loss": 1.553, + "step": 5397 + }, + { + "epoch": 0.3761279308783054, + "grad_norm": 0.6702936126540127, + "learning_rate": 5.045012913086607e-07, + "loss": 1.4964, + "step": 5398 + }, + { + "epoch": 0.3761976100059227, + "grad_norm": 0.6985064459949539, + "learning_rate": 5.044374863138998e-07, + "loss": 1.4901, + "step": 5399 + }, + { + "epoch": 0.3762672891335401, + "grad_norm": 0.7301075792224525, + "learning_rate": 5.043736755954493e-07, + "loss": 1.5574, + "step": 5400 + }, + { + "epoch": 0.37633696826115737, + "grad_norm": 0.749172825026058, + "learning_rate": 5.043098591563673e-07, + "loss": 1.5362, + "step": 5401 + }, + { + "epoch": 0.3764066473887747, + "grad_norm": 0.7094520771114469, + "learning_rate": 5.042460369997119e-07, + "loss": 1.5059, + "step": 5402 + }, + { + "epoch": 0.376476326516392, + "grad_norm": 0.7207987192677225, + "learning_rate": 5.041822091285415e-07, + "loss": 1.4182, + "step": 5403 + }, + { + "epoch": 0.37654600564400936, + "grad_norm": 0.7707547850189875, + "learning_rate": 5.041183755459151e-07, + "loss": 1.619, + "step": 5404 + }, + { + "epoch": 0.37661568477162666, + "grad_norm": 0.6830636923143021, + "learning_rate": 5.040545362548915e-07, + "loss": 1.5678, + "step": 5405 + }, + { + "epoch": 0.376685363899244, + "grad_norm": 0.732725900627606, + "learning_rate": 5.039906912585302e-07, + "loss": 1.5759, + "step": 5406 + }, + { + "epoch": 0.3767550430268613, + "grad_norm": 0.7422161056576552, + "learning_rate": 5.039268405598906e-07, + "loss": 1.4853, + "step": 5407 + }, + { + "epoch": 0.37682472215447865, + "grad_norm": 0.7256022665248492, + "learning_rate": 5.038629841620327e-07, + "loss": 1.5363, + "step": 5408 + }, + { + "epoch": 0.37689440128209595, + "grad_norm": 0.6892575042707771, + "learning_rate": 5.037991220680166e-07, + "loss": 1.6261, + "step": 5409 + }, + { + "epoch": 0.3769640804097133, + "grad_norm": 0.6804116027311646, + "learning_rate": 5.037352542809025e-07, + "loss": 1.5511, + "step": 5410 + }, + { + "epoch": 0.3770337595373306, + "grad_norm": 0.7689597898444267, + "learning_rate": 5.036713808037515e-07, + "loss": 1.5802, + "step": 5411 + }, + { + "epoch": 0.3771034386649479, + "grad_norm": 0.6964237941047433, + "learning_rate": 5.036075016396242e-07, + "loss": 1.4965, + "step": 5412 + }, + { + "epoch": 0.37717311779256524, + "grad_norm": 0.7435068845298077, + "learning_rate": 5.035436167915819e-07, + "loss": 1.5212, + "step": 5413 + }, + { + "epoch": 0.37724279692018253, + "grad_norm": 0.7349257406525216, + "learning_rate": 5.034797262626861e-07, + "loss": 1.4656, + "step": 5414 + }, + { + "epoch": 0.3773124760477999, + "grad_norm": 0.6882927821800732, + "learning_rate": 5.034158300559986e-07, + "loss": 1.5055, + "step": 5415 + }, + { + "epoch": 0.3773821551754172, + "grad_norm": 0.7189670232247855, + "learning_rate": 5.033519281745815e-07, + "loss": 1.6791, + "step": 5416 + }, + { + "epoch": 0.37745183430303453, + "grad_norm": 0.7508315288135808, + "learning_rate": 5.032880206214968e-07, + "loss": 1.5512, + "step": 5417 + }, + { + "epoch": 0.3775215134306518, + "grad_norm": 0.7366512193101076, + "learning_rate": 5.032241073998076e-07, + "loss": 1.5568, + "step": 5418 + }, + { + "epoch": 0.3775911925582692, + "grad_norm": 0.6640423914093597, + "learning_rate": 5.031601885125763e-07, + "loss": 1.487, + "step": 5419 + }, + { + "epoch": 0.37766087168588647, + "grad_norm": 0.6553708894293232, + "learning_rate": 5.030962639628663e-07, + "loss": 1.3905, + "step": 5420 + }, + { + "epoch": 0.3777305508135038, + "grad_norm": 0.738643876142048, + "learning_rate": 5.030323337537408e-07, + "loss": 1.5277, + "step": 5421 + }, + { + "epoch": 0.3778002299411211, + "grad_norm": 0.6964778026790148, + "learning_rate": 5.029683978882635e-07, + "loss": 1.4764, + "step": 5422 + }, + { + "epoch": 0.37786990906873846, + "grad_norm": 0.7305960630310033, + "learning_rate": 5.029044563694985e-07, + "loss": 1.5922, + "step": 5423 + }, + { + "epoch": 0.37793958819635576, + "grad_norm": 0.7575001757201274, + "learning_rate": 5.028405092005098e-07, + "loss": 1.5197, + "step": 5424 + }, + { + "epoch": 0.3780092673239731, + "grad_norm": 0.6845732719679103, + "learning_rate": 5.02776556384362e-07, + "loss": 1.6039, + "step": 5425 + }, + { + "epoch": 0.3780789464515904, + "grad_norm": 0.7388090099334985, + "learning_rate": 5.0271259792412e-07, + "loss": 1.4985, + "step": 5426 + }, + { + "epoch": 0.37814862557920775, + "grad_norm": 0.6893090869935965, + "learning_rate": 5.026486338228484e-07, + "loss": 1.5317, + "step": 5427 + }, + { + "epoch": 0.37821830470682505, + "grad_norm": 0.7481567809018613, + "learning_rate": 5.025846640836129e-07, + "loss": 1.5365, + "step": 5428 + }, + { + "epoch": 0.3782879838344424, + "grad_norm": 0.7477294337521692, + "learning_rate": 5.025206887094789e-07, + "loss": 1.6067, + "step": 5429 + }, + { + "epoch": 0.3783576629620597, + "grad_norm": 0.702638609667168, + "learning_rate": 5.02456707703512e-07, + "loss": 1.5787, + "step": 5430 + }, + { + "epoch": 0.37842734208967704, + "grad_norm": 0.6619744765586045, + "learning_rate": 5.023927210687788e-07, + "loss": 1.5176, + "step": 5431 + }, + { + "epoch": 0.37849702121729434, + "grad_norm": 0.6782202426529567, + "learning_rate": 5.023287288083453e-07, + "loss": 1.3905, + "step": 5432 + }, + { + "epoch": 0.3785667003449117, + "grad_norm": 0.7090382071085164, + "learning_rate": 5.022647309252783e-07, + "loss": 1.548, + "step": 5433 + }, + { + "epoch": 0.378636379472529, + "grad_norm": 0.8299623002044955, + "learning_rate": 5.022007274226446e-07, + "loss": 1.6024, + "step": 5434 + }, + { + "epoch": 0.37870605860014633, + "grad_norm": 0.7610548390668402, + "learning_rate": 5.021367183035114e-07, + "loss": 1.6044, + "step": 5435 + }, + { + "epoch": 0.37877573772776363, + "grad_norm": 0.7751485981500597, + "learning_rate": 5.020727035709463e-07, + "loss": 1.5679, + "step": 5436 + }, + { + "epoch": 0.378845416855381, + "grad_norm": 0.7063453389555776, + "learning_rate": 5.020086832280168e-07, + "loss": 1.5361, + "step": 5437 + }, + { + "epoch": 0.3789150959829983, + "grad_norm": 0.6902667458465007, + "learning_rate": 5.019446572777909e-07, + "loss": 1.5486, + "step": 5438 + }, + { + "epoch": 0.3789847751106156, + "grad_norm": 0.6808435381311487, + "learning_rate": 5.018806257233372e-07, + "loss": 1.5548, + "step": 5439 + }, + { + "epoch": 0.3790544542382329, + "grad_norm": 0.715213686595762, + "learning_rate": 5.018165885677238e-07, + "loss": 1.4849, + "step": 5440 + }, + { + "epoch": 0.37912413336585027, + "grad_norm": 0.6856158512743733, + "learning_rate": 5.017525458140196e-07, + "loss": 1.4843, + "step": 5441 + }, + { + "epoch": 0.37919381249346756, + "grad_norm": 0.7319426863719979, + "learning_rate": 5.016884974652937e-07, + "loss": 1.5292, + "step": 5442 + }, + { + "epoch": 0.3792634916210849, + "grad_norm": 0.7207250460074514, + "learning_rate": 5.016244435246157e-07, + "loss": 1.563, + "step": 5443 + }, + { + "epoch": 0.3793331707487022, + "grad_norm": 0.7791614239074859, + "learning_rate": 5.015603839950547e-07, + "loss": 1.5604, + "step": 5444 + }, + { + "epoch": 0.37940284987631956, + "grad_norm": 0.7509861266693079, + "learning_rate": 5.014963188796808e-07, + "loss": 1.6997, + "step": 5445 + }, + { + "epoch": 0.37947252900393685, + "grad_norm": 0.7855757249315607, + "learning_rate": 5.014322481815643e-07, + "loss": 1.3949, + "step": 5446 + }, + { + "epoch": 0.3795422081315542, + "grad_norm": 0.7625165655696754, + "learning_rate": 5.013681719037753e-07, + "loss": 1.6439, + "step": 5447 + }, + { + "epoch": 0.3796118872591715, + "grad_norm": 0.7524121252006589, + "learning_rate": 5.013040900493848e-07, + "loss": 1.5756, + "step": 5448 + }, + { + "epoch": 0.37968156638678885, + "grad_norm": 0.7053803505432666, + "learning_rate": 5.012400026214633e-07, + "loss": 1.409, + "step": 5449 + }, + { + "epoch": 0.37975124551440614, + "grad_norm": 0.7577344283390356, + "learning_rate": 5.011759096230823e-07, + "loss": 1.6402, + "step": 5450 + }, + { + "epoch": 0.3798209246420235, + "grad_norm": 0.6913724292869309, + "learning_rate": 5.011118110573133e-07, + "loss": 1.5397, + "step": 5451 + }, + { + "epoch": 0.3798906037696408, + "grad_norm": 0.656322013225874, + "learning_rate": 5.010477069272278e-07, + "loss": 1.4744, + "step": 5452 + }, + { + "epoch": 0.37996028289725814, + "grad_norm": 0.6962733401306395, + "learning_rate": 5.009835972358981e-07, + "loss": 1.6503, + "step": 5453 + }, + { + "epoch": 0.38002996202487543, + "grad_norm": 0.7618166591802068, + "learning_rate": 5.009194819863962e-07, + "loss": 1.5865, + "step": 5454 + }, + { + "epoch": 0.3800996411524928, + "grad_norm": 0.7562107443197053, + "learning_rate": 5.008553611817948e-07, + "loss": 1.5157, + "step": 5455 + }, + { + "epoch": 0.3801693202801101, + "grad_norm": 0.7074325420993474, + "learning_rate": 5.007912348251666e-07, + "loss": 1.4497, + "step": 5456 + }, + { + "epoch": 0.38023899940772743, + "grad_norm": 0.7242519999159883, + "learning_rate": 5.007271029195848e-07, + "loss": 1.4, + "step": 5457 + }, + { + "epoch": 0.3803086785353447, + "grad_norm": 0.6342168243414719, + "learning_rate": 5.006629654681224e-07, + "loss": 1.3542, + "step": 5458 + }, + { + "epoch": 0.3803783576629621, + "grad_norm": 0.7126075656711214, + "learning_rate": 5.005988224738535e-07, + "loss": 1.4551, + "step": 5459 + }, + { + "epoch": 0.38044803679057937, + "grad_norm": 0.7270437470510306, + "learning_rate": 5.005346739398517e-07, + "loss": 1.5397, + "step": 5460 + }, + { + "epoch": 0.3805177159181967, + "grad_norm": 0.7624831018762762, + "learning_rate": 5.00470519869191e-07, + "loss": 1.6236, + "step": 5461 + }, + { + "epoch": 0.380587395045814, + "grad_norm": 0.6992172062429126, + "learning_rate": 5.00406360264946e-07, + "loss": 1.4604, + "step": 5462 + }, + { + "epoch": 0.38065707417343136, + "grad_norm": 0.6566970192760084, + "learning_rate": 5.003421951301914e-07, + "loss": 1.4452, + "step": 5463 + }, + { + "epoch": 0.38072675330104866, + "grad_norm": 0.6998814130100979, + "learning_rate": 5.002780244680018e-07, + "loss": 1.5508, + "step": 5464 + }, + { + "epoch": 0.380796432428666, + "grad_norm": 0.7732539204937761, + "learning_rate": 5.002138482814528e-07, + "loss": 1.7181, + "step": 5465 + }, + { + "epoch": 0.3808661115562833, + "grad_norm": 0.6567647406485111, + "learning_rate": 5.001496665736194e-07, + "loss": 1.5205, + "step": 5466 + }, + { + "epoch": 0.38093579068390065, + "grad_norm": 0.7193635427827528, + "learning_rate": 5.000854793475778e-07, + "loss": 1.6246, + "step": 5467 + }, + { + "epoch": 0.38100546981151795, + "grad_norm": 0.6951657989880955, + "learning_rate": 5.000212866064038e-07, + "loss": 1.6509, + "step": 5468 + }, + { + "epoch": 0.3810751489391353, + "grad_norm": 0.7213532599547762, + "learning_rate": 4.999570883531735e-07, + "loss": 1.4985, + "step": 5469 + }, + { + "epoch": 0.3811448280667526, + "grad_norm": 0.754431340305897, + "learning_rate": 4.998928845909635e-07, + "loss": 1.5693, + "step": 5470 + }, + { + "epoch": 0.38121450719436994, + "grad_norm": 0.6989133775278445, + "learning_rate": 4.998286753228507e-07, + "loss": 1.5648, + "step": 5471 + }, + { + "epoch": 0.38128418632198724, + "grad_norm": 0.7707962594955189, + "learning_rate": 4.99764460551912e-07, + "loss": 1.6039, + "step": 5472 + }, + { + "epoch": 0.3813538654496046, + "grad_norm": 0.7054262814931245, + "learning_rate": 4.997002402812248e-07, + "loss": 1.4979, + "step": 5473 + }, + { + "epoch": 0.3814235445772219, + "grad_norm": 0.7334950363402816, + "learning_rate": 4.996360145138664e-07, + "loss": 1.5707, + "step": 5474 + }, + { + "epoch": 0.38149322370483923, + "grad_norm": 0.7876729845840897, + "learning_rate": 4.99571783252915e-07, + "loss": 1.7612, + "step": 5475 + }, + { + "epoch": 0.38156290283245653, + "grad_norm": 0.658733357304968, + "learning_rate": 4.995075465014486e-07, + "loss": 1.4626, + "step": 5476 + }, + { + "epoch": 0.3816325819600739, + "grad_norm": 0.7316933855652641, + "learning_rate": 4.994433042625454e-07, + "loss": 1.5544, + "step": 5477 + }, + { + "epoch": 0.3817022610876912, + "grad_norm": 0.6762053571770205, + "learning_rate": 4.99379056539284e-07, + "loss": 1.4745, + "step": 5478 + }, + { + "epoch": 0.3817719402153085, + "grad_norm": 0.7084236910699324, + "learning_rate": 4.993148033347437e-07, + "loss": 1.5001, + "step": 5479 + }, + { + "epoch": 0.3818416193429258, + "grad_norm": 0.7271727538874175, + "learning_rate": 4.992505446520031e-07, + "loss": 1.5476, + "step": 5480 + }, + { + "epoch": 0.38191129847054317, + "grad_norm": 0.8292686404246161, + "learning_rate": 4.991862804941419e-07, + "loss": 1.5489, + "step": 5481 + }, + { + "epoch": 0.38198097759816046, + "grad_norm": 0.7133473578558076, + "learning_rate": 4.991220108642397e-07, + "loss": 1.5058, + "step": 5482 + }, + { + "epoch": 0.3820506567257778, + "grad_norm": 0.8080700827951963, + "learning_rate": 4.990577357653765e-07, + "loss": 1.566, + "step": 5483 + }, + { + "epoch": 0.3821203358533951, + "grad_norm": 0.6708760556900807, + "learning_rate": 4.989934552006323e-07, + "loss": 1.5028, + "step": 5484 + }, + { + "epoch": 0.38219001498101246, + "grad_norm": 0.7097547017774847, + "learning_rate": 4.989291691730879e-07, + "loss": 1.4198, + "step": 5485 + }, + { + "epoch": 0.38225969410862976, + "grad_norm": 0.7369934710042176, + "learning_rate": 4.988648776858237e-07, + "loss": 1.4532, + "step": 5486 + }, + { + "epoch": 0.3823293732362471, + "grad_norm": 0.6939493609803657, + "learning_rate": 4.98800580741921e-07, + "loss": 1.4274, + "step": 5487 + }, + { + "epoch": 0.3823990523638644, + "grad_norm": 0.6749067587849705, + "learning_rate": 4.987362783444606e-07, + "loss": 1.5632, + "step": 5488 + }, + { + "epoch": 0.38246873149148175, + "grad_norm": 0.6991470780685725, + "learning_rate": 4.986719704965244e-07, + "loss": 1.4716, + "step": 5489 + }, + { + "epoch": 0.38253841061909905, + "grad_norm": 0.7311252973539579, + "learning_rate": 4.98607657201194e-07, + "loss": 1.5337, + "step": 5490 + }, + { + "epoch": 0.3826080897467164, + "grad_norm": 0.746727609531956, + "learning_rate": 4.985433384615513e-07, + "loss": 1.563, + "step": 5491 + }, + { + "epoch": 0.3826777688743337, + "grad_norm": 0.6725022234464829, + "learning_rate": 4.984790142806788e-07, + "loss": 1.4125, + "step": 5492 + }, + { + "epoch": 0.38274744800195104, + "grad_norm": 0.7290834568894429, + "learning_rate": 4.98414684661659e-07, + "loss": 1.4797, + "step": 5493 + }, + { + "epoch": 0.38281712712956834, + "grad_norm": 0.8232159581467848, + "learning_rate": 4.983503496075746e-07, + "loss": 1.5075, + "step": 5494 + }, + { + "epoch": 0.3828868062571857, + "grad_norm": 0.7540441799845059, + "learning_rate": 4.982860091215087e-07, + "loss": 1.6719, + "step": 5495 + }, + { + "epoch": 0.382956485384803, + "grad_norm": 0.6928721393975205, + "learning_rate": 4.982216632065449e-07, + "loss": 1.4631, + "step": 5496 + }, + { + "epoch": 0.38302616451242033, + "grad_norm": 0.6788751573672828, + "learning_rate": 4.981573118657665e-07, + "loss": 1.3759, + "step": 5497 + }, + { + "epoch": 0.3830958436400376, + "grad_norm": 0.7214898466660512, + "learning_rate": 4.980929551022572e-07, + "loss": 1.6067, + "step": 5498 + }, + { + "epoch": 0.383165522767655, + "grad_norm": 0.7308153122274459, + "learning_rate": 4.980285929191015e-07, + "loss": 1.4594, + "step": 5499 + }, + { + "epoch": 0.38323520189527227, + "grad_norm": 0.6670540192625501, + "learning_rate": 4.979642253193835e-07, + "loss": 1.4535, + "step": 5500 + }, + { + "epoch": 0.3833048810228896, + "grad_norm": 0.7395483755910071, + "learning_rate": 4.978998523061879e-07, + "loss": 1.539, + "step": 5501 + }, + { + "epoch": 0.3833745601505069, + "grad_norm": 0.7695363996801382, + "learning_rate": 4.978354738825996e-07, + "loss": 1.4483, + "step": 5502 + }, + { + "epoch": 0.3834442392781242, + "grad_norm": 0.718100408377757, + "learning_rate": 4.977710900517039e-07, + "loss": 1.6256, + "step": 5503 + }, + { + "epoch": 0.38351391840574156, + "grad_norm": 0.69109645152382, + "learning_rate": 4.977067008165859e-07, + "loss": 1.5024, + "step": 5504 + }, + { + "epoch": 0.38358359753335886, + "grad_norm": 0.7116426387946586, + "learning_rate": 4.976423061803315e-07, + "loss": 1.6046, + "step": 5505 + }, + { + "epoch": 0.3836532766609762, + "grad_norm": 0.7852170766850769, + "learning_rate": 4.975779061460264e-07, + "loss": 1.7033, + "step": 5506 + }, + { + "epoch": 0.3837229557885935, + "grad_norm": 0.7689308983734927, + "learning_rate": 4.97513500716757e-07, + "loss": 1.557, + "step": 5507 + }, + { + "epoch": 0.38379263491621085, + "grad_norm": 0.7233222068190561, + "learning_rate": 4.974490898956097e-07, + "loss": 1.5805, + "step": 5508 + }, + { + "epoch": 0.38386231404382815, + "grad_norm": 0.7177217070825982, + "learning_rate": 4.973846736856711e-07, + "loss": 1.5573, + "step": 5509 + }, + { + "epoch": 0.3839319931714455, + "grad_norm": 0.7513702094061874, + "learning_rate": 4.973202520900282e-07, + "loss": 1.5063, + "step": 5510 + }, + { + "epoch": 0.3840016722990628, + "grad_norm": 0.7713853038917127, + "learning_rate": 4.972558251117684e-07, + "loss": 1.5505, + "step": 5511 + }, + { + "epoch": 0.38407135142668014, + "grad_norm": 0.7090530504996623, + "learning_rate": 4.971913927539787e-07, + "loss": 1.4762, + "step": 5512 + }, + { + "epoch": 0.38414103055429744, + "grad_norm": 0.7231100039969972, + "learning_rate": 4.971269550197474e-07, + "loss": 1.5707, + "step": 5513 + }, + { + "epoch": 0.3842107096819148, + "grad_norm": 0.705309164694651, + "learning_rate": 4.970625119121621e-07, + "loss": 1.5038, + "step": 5514 + }, + { + "epoch": 0.3842803888095321, + "grad_norm": 0.6751102981403432, + "learning_rate": 4.969980634343112e-07, + "loss": 1.6372, + "step": 5515 + }, + { + "epoch": 0.38435006793714943, + "grad_norm": 0.7449367266167386, + "learning_rate": 4.969336095892832e-07, + "loss": 1.5536, + "step": 5516 + }, + { + "epoch": 0.3844197470647667, + "grad_norm": 0.8001794393337616, + "learning_rate": 4.968691503801668e-07, + "loss": 1.5196, + "step": 5517 + }, + { + "epoch": 0.3844894261923841, + "grad_norm": 0.8006640674300962, + "learning_rate": 4.968046858100511e-07, + "loss": 1.5362, + "step": 5518 + }, + { + "epoch": 0.38455910532000137, + "grad_norm": 0.7143062739796038, + "learning_rate": 4.967402158820253e-07, + "loss": 1.5016, + "step": 5519 + }, + { + "epoch": 0.3846287844476187, + "grad_norm": 0.7395199641524862, + "learning_rate": 4.966757405991789e-07, + "loss": 1.5336, + "step": 5520 + }, + { + "epoch": 0.384698463575236, + "grad_norm": 0.7173154576633266, + "learning_rate": 4.966112599646018e-07, + "loss": 1.473, + "step": 5521 + }, + { + "epoch": 0.38476814270285337, + "grad_norm": 0.6862498887428835, + "learning_rate": 4.965467739813839e-07, + "loss": 1.5229, + "step": 5522 + }, + { + "epoch": 0.38483782183047066, + "grad_norm": 0.7432309321652748, + "learning_rate": 4.964822826526156e-07, + "loss": 1.4813, + "step": 5523 + }, + { + "epoch": 0.384907500958088, + "grad_norm": 0.744188463216907, + "learning_rate": 4.964177859813877e-07, + "loss": 1.4979, + "step": 5524 + }, + { + "epoch": 0.3849771800857053, + "grad_norm": 0.6818187295241545, + "learning_rate": 4.963532839707905e-07, + "loss": 1.5359, + "step": 5525 + }, + { + "epoch": 0.38504685921332266, + "grad_norm": 0.7178209922409182, + "learning_rate": 4.962887766239153e-07, + "loss": 1.6431, + "step": 5526 + }, + { + "epoch": 0.38511653834093995, + "grad_norm": 0.7332062253347716, + "learning_rate": 4.962242639438536e-07, + "loss": 1.5583, + "step": 5527 + }, + { + "epoch": 0.3851862174685573, + "grad_norm": 0.6889685000442761, + "learning_rate": 4.961597459336968e-07, + "loss": 1.5706, + "step": 5528 + }, + { + "epoch": 0.3852558965961746, + "grad_norm": 0.8052842084542119, + "learning_rate": 4.960952225965369e-07, + "loss": 1.5433, + "step": 5529 + }, + { + "epoch": 0.38532557572379195, + "grad_norm": 0.766408402464758, + "learning_rate": 4.960306939354656e-07, + "loss": 1.5682, + "step": 5530 + }, + { + "epoch": 0.38539525485140924, + "grad_norm": 0.7484794652400308, + "learning_rate": 4.959661599535756e-07, + "loss": 1.5582, + "step": 5531 + }, + { + "epoch": 0.3854649339790266, + "grad_norm": 0.7820263373509483, + "learning_rate": 4.959016206539595e-07, + "loss": 1.7689, + "step": 5532 + }, + { + "epoch": 0.3855346131066439, + "grad_norm": 0.7259786737233994, + "learning_rate": 4.9583707603971e-07, + "loss": 1.6318, + "step": 5533 + }, + { + "epoch": 0.38560429223426124, + "grad_norm": 0.7676985737076506, + "learning_rate": 4.957725261139201e-07, + "loss": 1.6155, + "step": 5534 + }, + { + "epoch": 0.38567397136187853, + "grad_norm": 0.7069887072622391, + "learning_rate": 4.957079708796835e-07, + "loss": 1.3996, + "step": 5535 + }, + { + "epoch": 0.3857436504894959, + "grad_norm": 0.7030376049130003, + "learning_rate": 4.956434103400937e-07, + "loss": 1.5679, + "step": 5536 + }, + { + "epoch": 0.3858133296171132, + "grad_norm": 0.6855415717927498, + "learning_rate": 4.955788444982444e-07, + "loss": 1.5555, + "step": 5537 + }, + { + "epoch": 0.3858830087447305, + "grad_norm": 0.7614205686727051, + "learning_rate": 4.9551427335723e-07, + "loss": 1.6741, + "step": 5538 + }, + { + "epoch": 0.3859526878723478, + "grad_norm": 0.6989562304165375, + "learning_rate": 4.954496969201446e-07, + "loss": 1.6391, + "step": 5539 + }, + { + "epoch": 0.38602236699996517, + "grad_norm": 0.695698775967819, + "learning_rate": 4.953851151900831e-07, + "loss": 1.5328, + "step": 5540 + }, + { + "epoch": 0.38609204612758247, + "grad_norm": 0.7417673904195509, + "learning_rate": 4.953205281701402e-07, + "loss": 1.6011, + "step": 5541 + }, + { + "epoch": 0.3861617252551998, + "grad_norm": 0.6907559477471068, + "learning_rate": 4.95255935863411e-07, + "loss": 1.4323, + "step": 5542 + }, + { + "epoch": 0.3862314043828171, + "grad_norm": 0.6779292317473243, + "learning_rate": 4.951913382729913e-07, + "loss": 1.563, + "step": 5543 + }, + { + "epoch": 0.38630108351043446, + "grad_norm": 0.7723058973909813, + "learning_rate": 4.951267354019762e-07, + "loss": 1.5341, + "step": 5544 + }, + { + "epoch": 0.38637076263805176, + "grad_norm": 0.7137990992337757, + "learning_rate": 4.95062127253462e-07, + "loss": 1.5635, + "step": 5545 + }, + { + "epoch": 0.3864404417656691, + "grad_norm": 0.704918015158124, + "learning_rate": 4.949975138305446e-07, + "loss": 1.4581, + "step": 5546 + }, + { + "epoch": 0.3865101208932864, + "grad_norm": 0.7155688218885253, + "learning_rate": 4.949328951363204e-07, + "loss": 1.5916, + "step": 5547 + }, + { + "epoch": 0.38657980002090375, + "grad_norm": 0.7932583910439622, + "learning_rate": 4.948682711738865e-07, + "loss": 1.5528, + "step": 5548 + }, + { + "epoch": 0.38664947914852105, + "grad_norm": 0.8083960035017587, + "learning_rate": 4.948036419463393e-07, + "loss": 1.5914, + "step": 5549 + }, + { + "epoch": 0.3867191582761384, + "grad_norm": 0.667764867736382, + "learning_rate": 4.947390074567761e-07, + "loss": 1.5833, + "step": 5550 + }, + { + "epoch": 0.3867888374037557, + "grad_norm": 0.7419549291652704, + "learning_rate": 4.946743677082945e-07, + "loss": 1.5124, + "step": 5551 + }, + { + "epoch": 0.38685851653137304, + "grad_norm": 0.6870025872836284, + "learning_rate": 4.946097227039921e-07, + "loss": 1.5114, + "step": 5552 + }, + { + "epoch": 0.38692819565899034, + "grad_norm": 0.6755742379070345, + "learning_rate": 4.945450724469665e-07, + "loss": 1.5062, + "step": 5553 + }, + { + "epoch": 0.3869978747866077, + "grad_norm": 0.7192605944563015, + "learning_rate": 4.944804169403164e-07, + "loss": 1.5299, + "step": 5554 + }, + { + "epoch": 0.387067553914225, + "grad_norm": 0.7074686788750786, + "learning_rate": 4.944157561871397e-07, + "loss": 1.4861, + "step": 5555 + }, + { + "epoch": 0.38713723304184233, + "grad_norm": 0.7303712587132282, + "learning_rate": 4.943510901905356e-07, + "loss": 1.5789, + "step": 5556 + }, + { + "epoch": 0.3872069121694596, + "grad_norm": 0.70092718611266, + "learning_rate": 4.942864189536027e-07, + "loss": 1.5269, + "step": 5557 + }, + { + "epoch": 0.387276591297077, + "grad_norm": 0.7749207884282524, + "learning_rate": 4.942217424794401e-07, + "loss": 1.5306, + "step": 5558 + }, + { + "epoch": 0.3873462704246943, + "grad_norm": 0.7612048856830113, + "learning_rate": 4.941570607711474e-07, + "loss": 1.6048, + "step": 5559 + }, + { + "epoch": 0.3874159495523116, + "grad_norm": 0.7291029938509181, + "learning_rate": 4.940923738318245e-07, + "loss": 1.4984, + "step": 5560 + }, + { + "epoch": 0.3874856286799289, + "grad_norm": 0.6677895182838004, + "learning_rate": 4.940276816645708e-07, + "loss": 1.4915, + "step": 5561 + }, + { + "epoch": 0.38755530780754627, + "grad_norm": 0.7735121442224963, + "learning_rate": 4.939629842724868e-07, + "loss": 1.5627, + "step": 5562 + }, + { + "epoch": 0.38762498693516356, + "grad_norm": 0.7790175853606958, + "learning_rate": 4.93898281658673e-07, + "loss": 1.4801, + "step": 5563 + }, + { + "epoch": 0.3876946660627809, + "grad_norm": 0.7145681800827799, + "learning_rate": 4.9383357382623e-07, + "loss": 1.4691, + "step": 5564 + }, + { + "epoch": 0.3877643451903982, + "grad_norm": 0.7432846857234464, + "learning_rate": 4.937688607782586e-07, + "loss": 1.5028, + "step": 5565 + }, + { + "epoch": 0.38783402431801556, + "grad_norm": 0.7159183915911248, + "learning_rate": 4.937041425178601e-07, + "loss": 1.5037, + "step": 5566 + }, + { + "epoch": 0.38790370344563285, + "grad_norm": 0.7269668278784979, + "learning_rate": 4.936394190481361e-07, + "loss": 1.4085, + "step": 5567 + }, + { + "epoch": 0.3879733825732502, + "grad_norm": 0.6866858671417458, + "learning_rate": 4.93574690372188e-07, + "loss": 1.4342, + "step": 5568 + }, + { + "epoch": 0.3880430617008675, + "grad_norm": 0.7145490114822929, + "learning_rate": 4.935099564931177e-07, + "loss": 1.5746, + "step": 5569 + }, + { + "epoch": 0.38811274082848485, + "grad_norm": 0.7138045405297827, + "learning_rate": 4.934452174140276e-07, + "loss": 1.5896, + "step": 5570 + }, + { + "epoch": 0.38818241995610214, + "grad_norm": 0.7095926528479751, + "learning_rate": 4.933804731380201e-07, + "loss": 1.5292, + "step": 5571 + }, + { + "epoch": 0.3882520990837195, + "grad_norm": 0.7129684749761618, + "learning_rate": 4.933157236681978e-07, + "loss": 1.5525, + "step": 5572 + }, + { + "epoch": 0.3883217782113368, + "grad_norm": 0.7328684505244791, + "learning_rate": 4.932509690076638e-07, + "loss": 1.7226, + "step": 5573 + }, + { + "epoch": 0.38839145733895414, + "grad_norm": 0.7339828615577176, + "learning_rate": 4.931862091595208e-07, + "loss": 1.6035, + "step": 5574 + }, + { + "epoch": 0.38846113646657143, + "grad_norm": 0.6450537560923328, + "learning_rate": 4.931214441268729e-07, + "loss": 1.5124, + "step": 5575 + }, + { + "epoch": 0.3885308155941888, + "grad_norm": 0.7300985149976399, + "learning_rate": 4.930566739128232e-07, + "loss": 1.4279, + "step": 5576 + }, + { + "epoch": 0.3886004947218061, + "grad_norm": 0.6980864377031916, + "learning_rate": 4.92991898520476e-07, + "loss": 1.5553, + "step": 5577 + }, + { + "epoch": 0.38867017384942343, + "grad_norm": 0.7027638402706293, + "learning_rate": 4.929271179529351e-07, + "loss": 1.5512, + "step": 5578 + }, + { + "epoch": 0.3887398529770407, + "grad_norm": 0.7548467168888858, + "learning_rate": 4.928623322133055e-07, + "loss": 1.5324, + "step": 5579 + }, + { + "epoch": 0.3888095321046581, + "grad_norm": 0.7409166404373027, + "learning_rate": 4.927975413046912e-07, + "loss": 1.6207, + "step": 5580 + }, + { + "epoch": 0.38887921123227537, + "grad_norm": 0.6605626587391256, + "learning_rate": 4.927327452301975e-07, + "loss": 1.502, + "step": 5581 + }, + { + "epoch": 0.3889488903598927, + "grad_norm": 0.6880962749504176, + "learning_rate": 4.926679439929295e-07, + "loss": 1.4707, + "step": 5582 + }, + { + "epoch": 0.38901856948751, + "grad_norm": 0.7802737438340347, + "learning_rate": 4.926031375959926e-07, + "loss": 1.5877, + "step": 5583 + }, + { + "epoch": 0.38908824861512736, + "grad_norm": 0.7257829551630415, + "learning_rate": 4.925383260424924e-07, + "loss": 1.5467, + "step": 5584 + }, + { + "epoch": 0.38915792774274466, + "grad_norm": 0.6993590557963282, + "learning_rate": 4.924735093355348e-07, + "loss": 1.5517, + "step": 5585 + }, + { + "epoch": 0.389227606870362, + "grad_norm": 0.7438495635411839, + "learning_rate": 4.92408687478226e-07, + "loss": 1.5874, + "step": 5586 + }, + { + "epoch": 0.3892972859979793, + "grad_norm": 0.733088731946248, + "learning_rate": 4.923438604736725e-07, + "loss": 1.5717, + "step": 5587 + }, + { + "epoch": 0.38936696512559665, + "grad_norm": 0.7083801837424353, + "learning_rate": 4.922790283249808e-07, + "loss": 1.3843, + "step": 5588 + }, + { + "epoch": 0.38943664425321395, + "grad_norm": 0.717862741840073, + "learning_rate": 4.922141910352578e-07, + "loss": 1.5743, + "step": 5589 + }, + { + "epoch": 0.3895063233808313, + "grad_norm": 0.7021056241225214, + "learning_rate": 4.921493486076106e-07, + "loss": 1.4911, + "step": 5590 + }, + { + "epoch": 0.3895760025084486, + "grad_norm": 0.7326892044390457, + "learning_rate": 4.920845010451468e-07, + "loss": 1.5485, + "step": 5591 + }, + { + "epoch": 0.38964568163606594, + "grad_norm": 0.7167763296787694, + "learning_rate": 4.920196483509737e-07, + "loss": 1.5422, + "step": 5592 + }, + { + "epoch": 0.38971536076368324, + "grad_norm": 0.6716757792080215, + "learning_rate": 4.919547905281994e-07, + "loss": 1.6012, + "step": 5593 + }, + { + "epoch": 0.38978503989130053, + "grad_norm": 0.7156461246251185, + "learning_rate": 4.91889927579932e-07, + "loss": 1.5866, + "step": 5594 + }, + { + "epoch": 0.3898547190189179, + "grad_norm": 0.7419782612066664, + "learning_rate": 4.918250595092798e-07, + "loss": 1.4934, + "step": 5595 + }, + { + "epoch": 0.3899243981465352, + "grad_norm": 0.7476547747400746, + "learning_rate": 4.917601863193514e-07, + "loss": 1.6338, + "step": 5596 + }, + { + "epoch": 0.38999407727415253, + "grad_norm": 0.728083014760731, + "learning_rate": 4.916953080132558e-07, + "loss": 1.5645, + "step": 5597 + }, + { + "epoch": 0.3900637564017698, + "grad_norm": 0.7735797984284867, + "learning_rate": 4.916304245941018e-07, + "loss": 1.6132, + "step": 5598 + }, + { + "epoch": 0.3901334355293872, + "grad_norm": 0.7034034881934533, + "learning_rate": 4.915655360649992e-07, + "loss": 1.5384, + "step": 5599 + }, + { + "epoch": 0.39020311465700447, + "grad_norm": 0.7489804917635291, + "learning_rate": 4.915006424290572e-07, + "loss": 1.5392, + "step": 5600 + }, + { + "epoch": 0.3902727937846218, + "grad_norm": 0.7456332541983953, + "learning_rate": 4.914357436893857e-07, + "loss": 1.5891, + "step": 5601 + }, + { + "epoch": 0.3903424729122391, + "grad_norm": 0.7036216749370718, + "learning_rate": 4.91370839849095e-07, + "loss": 1.6004, + "step": 5602 + }, + { + "epoch": 0.39041215203985646, + "grad_norm": 0.7193251436793048, + "learning_rate": 4.913059309112952e-07, + "loss": 1.529, + "step": 5603 + }, + { + "epoch": 0.39048183116747376, + "grad_norm": 0.7729473570738046, + "learning_rate": 4.91241016879097e-07, + "loss": 1.5915, + "step": 5604 + }, + { + "epoch": 0.3905515102950911, + "grad_norm": 0.7334388646784056, + "learning_rate": 4.911760977556112e-07, + "loss": 1.5761, + "step": 5605 + }, + { + "epoch": 0.3906211894227084, + "grad_norm": 0.7335809025498045, + "learning_rate": 4.911111735439487e-07, + "loss": 1.5482, + "step": 5606 + }, + { + "epoch": 0.39069086855032575, + "grad_norm": 0.7207635907762502, + "learning_rate": 4.91046244247221e-07, + "loss": 1.4662, + "step": 5607 + }, + { + "epoch": 0.39076054767794305, + "grad_norm": 0.7091201638463493, + "learning_rate": 4.909813098685395e-07, + "loss": 1.5795, + "step": 5608 + }, + { + "epoch": 0.3908302268055604, + "grad_norm": 0.7013771838761281, + "learning_rate": 4.909163704110161e-07, + "loss": 1.544, + "step": 5609 + }, + { + "epoch": 0.3908999059331777, + "grad_norm": 0.7074049818275138, + "learning_rate": 4.908514258777628e-07, + "loss": 1.6141, + "step": 5610 + }, + { + "epoch": 0.39096958506079504, + "grad_norm": 0.6972315835236457, + "learning_rate": 4.907864762718918e-07, + "loss": 1.4784, + "step": 5611 + }, + { + "epoch": 0.39103926418841234, + "grad_norm": 0.715516245329465, + "learning_rate": 4.907215215965157e-07, + "loss": 1.5359, + "step": 5612 + }, + { + "epoch": 0.3911089433160297, + "grad_norm": 0.7574553672743075, + "learning_rate": 4.906565618547475e-07, + "loss": 1.7375, + "step": 5613 + }, + { + "epoch": 0.391178622443647, + "grad_norm": 0.6502857177607614, + "learning_rate": 4.905915970496996e-07, + "loss": 1.5163, + "step": 5614 + }, + { + "epoch": 0.39124830157126433, + "grad_norm": 0.6938368096085746, + "learning_rate": 4.905266271844857e-07, + "loss": 1.5624, + "step": 5615 + }, + { + "epoch": 0.39131798069888163, + "grad_norm": 0.6712918492602664, + "learning_rate": 4.904616522622193e-07, + "loss": 1.5201, + "step": 5616 + }, + { + "epoch": 0.391387659826499, + "grad_norm": 0.7988195643434735, + "learning_rate": 4.903966722860139e-07, + "loss": 1.5929, + "step": 5617 + }, + { + "epoch": 0.3914573389541163, + "grad_norm": 0.7057183780225315, + "learning_rate": 4.903316872589836e-07, + "loss": 1.648, + "step": 5618 + }, + { + "epoch": 0.3915270180817336, + "grad_norm": 0.6757523132570389, + "learning_rate": 4.902666971842426e-07, + "loss": 1.5514, + "step": 5619 + }, + { + "epoch": 0.3915966972093509, + "grad_norm": 0.6980300975678734, + "learning_rate": 4.902017020649053e-07, + "loss": 1.6303, + "step": 5620 + }, + { + "epoch": 0.39166637633696827, + "grad_norm": 0.7461011434866858, + "learning_rate": 4.901367019040866e-07, + "loss": 1.4192, + "step": 5621 + }, + { + "epoch": 0.39173605546458556, + "grad_norm": 0.7287549638649695, + "learning_rate": 4.900716967049011e-07, + "loss": 1.5044, + "step": 5622 + }, + { + "epoch": 0.3918057345922029, + "grad_norm": 0.6884426807253309, + "learning_rate": 4.900066864704644e-07, + "loss": 1.4788, + "step": 5623 + }, + { + "epoch": 0.3918754137198202, + "grad_norm": 0.698307728524486, + "learning_rate": 4.899416712038918e-07, + "loss": 1.5435, + "step": 5624 + }, + { + "epoch": 0.39194509284743756, + "grad_norm": 0.6789008299948818, + "learning_rate": 4.898766509082986e-07, + "loss": 1.5289, + "step": 5625 + }, + { + "epoch": 0.39201477197505485, + "grad_norm": 0.7970341094255471, + "learning_rate": 4.89811625586801e-07, + "loss": 1.7357, + "step": 5626 + }, + { + "epoch": 0.3920844511026722, + "grad_norm": 0.7188738277514154, + "learning_rate": 4.897465952425153e-07, + "loss": 1.4774, + "step": 5627 + }, + { + "epoch": 0.3921541302302895, + "grad_norm": 0.8373284142960566, + "learning_rate": 4.896815598785576e-07, + "loss": 1.6099, + "step": 5628 + }, + { + "epoch": 0.39222380935790685, + "grad_norm": 0.7088823337013, + "learning_rate": 4.896165194980447e-07, + "loss": 1.529, + "step": 5629 + }, + { + "epoch": 0.39229348848552414, + "grad_norm": 0.7223001653765114, + "learning_rate": 4.895514741040933e-07, + "loss": 1.5641, + "step": 5630 + }, + { + "epoch": 0.3923631676131415, + "grad_norm": 0.6914220741166752, + "learning_rate": 4.894864236998208e-07, + "loss": 1.5283, + "step": 5631 + }, + { + "epoch": 0.3924328467407588, + "grad_norm": 0.7147757332932779, + "learning_rate": 4.894213682883443e-07, + "loss": 1.521, + "step": 5632 + }, + { + "epoch": 0.39250252586837614, + "grad_norm": 0.764680925124264, + "learning_rate": 4.893563078727815e-07, + "loss": 1.5515, + "step": 5633 + }, + { + "epoch": 0.39257220499599343, + "grad_norm": 0.7822852931923628, + "learning_rate": 4.892912424562501e-07, + "loss": 1.5652, + "step": 5634 + }, + { + "epoch": 0.3926418841236108, + "grad_norm": 0.7929478109855593, + "learning_rate": 4.892261720418682e-07, + "loss": 1.7101, + "step": 5635 + }, + { + "epoch": 0.3927115632512281, + "grad_norm": 0.7218548026309808, + "learning_rate": 4.891610966327543e-07, + "loss": 1.652, + "step": 5636 + }, + { + "epoch": 0.39278124237884543, + "grad_norm": 0.7335248098012389, + "learning_rate": 4.890960162320267e-07, + "loss": 1.5682, + "step": 5637 + }, + { + "epoch": 0.3928509215064627, + "grad_norm": 0.7143306188937307, + "learning_rate": 4.890309308428044e-07, + "loss": 1.5508, + "step": 5638 + }, + { + "epoch": 0.3929206006340801, + "grad_norm": 0.6688063423914016, + "learning_rate": 4.889658404682062e-07, + "loss": 1.5865, + "step": 5639 + }, + { + "epoch": 0.39299027976169737, + "grad_norm": 0.6915994442437262, + "learning_rate": 4.889007451113515e-07, + "loss": 1.4609, + "step": 5640 + }, + { + "epoch": 0.3930599588893147, + "grad_norm": 0.7061342396943753, + "learning_rate": 4.8883564477536e-07, + "loss": 1.5257, + "step": 5641 + }, + { + "epoch": 0.393129638016932, + "grad_norm": 0.7330023613356081, + "learning_rate": 4.88770539463351e-07, + "loss": 1.5863, + "step": 5642 + }, + { + "epoch": 0.39319931714454937, + "grad_norm": 0.6569236724030605, + "learning_rate": 4.887054291784448e-07, + "loss": 1.5345, + "step": 5643 + }, + { + "epoch": 0.39326899627216666, + "grad_norm": 0.6997962715930213, + "learning_rate": 4.886403139237615e-07, + "loss": 1.6101, + "step": 5644 + }, + { + "epoch": 0.393338675399784, + "grad_norm": 0.6660182692895334, + "learning_rate": 4.885751937024216e-07, + "loss": 1.4714, + "step": 5645 + }, + { + "epoch": 0.3934083545274013, + "grad_norm": 0.7775772452706329, + "learning_rate": 4.885100685175459e-07, + "loss": 1.6419, + "step": 5646 + }, + { + "epoch": 0.39347803365501866, + "grad_norm": 0.7940753653334305, + "learning_rate": 4.88444938372255e-07, + "loss": 1.4888, + "step": 5647 + }, + { + "epoch": 0.39354771278263595, + "grad_norm": 0.6889526655071125, + "learning_rate": 4.883798032696704e-07, + "loss": 1.5522, + "step": 5648 + }, + { + "epoch": 0.3936173919102533, + "grad_norm": 0.7216018333181632, + "learning_rate": 4.883146632129135e-07, + "loss": 1.424, + "step": 5649 + }, + { + "epoch": 0.3936870710378706, + "grad_norm": 0.7383631566837524, + "learning_rate": 4.882495182051056e-07, + "loss": 1.5577, + "step": 5650 + }, + { + "epoch": 0.39375675016548795, + "grad_norm": 0.6910122209800914, + "learning_rate": 4.88184368249369e-07, + "loss": 1.651, + "step": 5651 + }, + { + "epoch": 0.39382642929310524, + "grad_norm": 0.7071475670558016, + "learning_rate": 4.881192133488256e-07, + "loss": 1.4289, + "step": 5652 + }, + { + "epoch": 0.3938961084207226, + "grad_norm": 0.7940647359092496, + "learning_rate": 4.880540535065978e-07, + "loss": 1.5281, + "step": 5653 + }, + { + "epoch": 0.3939657875483399, + "grad_norm": 0.7424024603209955, + "learning_rate": 4.879888887258082e-07, + "loss": 1.7073, + "step": 5654 + }, + { + "epoch": 0.39403546667595724, + "grad_norm": 0.7040686230275012, + "learning_rate": 4.879237190095795e-07, + "loss": 1.6446, + "step": 5655 + }, + { + "epoch": 0.39410514580357453, + "grad_norm": 0.6804339815316579, + "learning_rate": 4.878585443610351e-07, + "loss": 1.5443, + "step": 5656 + }, + { + "epoch": 0.3941748249311919, + "grad_norm": 0.7694443687860312, + "learning_rate": 4.877933647832978e-07, + "loss": 1.6059, + "step": 5657 + }, + { + "epoch": 0.3942445040588092, + "grad_norm": 0.7095308678114916, + "learning_rate": 4.877281802794917e-07, + "loss": 1.5783, + "step": 5658 + }, + { + "epoch": 0.3943141831864265, + "grad_norm": 0.6834728695375323, + "learning_rate": 4.876629908527402e-07, + "loss": 1.4167, + "step": 5659 + }, + { + "epoch": 0.3943838623140438, + "grad_norm": 0.732453498933405, + "learning_rate": 4.875977965061674e-07, + "loss": 1.5803, + "step": 5660 + }, + { + "epoch": 0.39445354144166117, + "grad_norm": 0.8493170677261569, + "learning_rate": 4.875325972428976e-07, + "loss": 1.5688, + "step": 5661 + }, + { + "epoch": 0.39452322056927847, + "grad_norm": 0.7366435447689659, + "learning_rate": 4.874673930660551e-07, + "loss": 1.6815, + "step": 5662 + }, + { + "epoch": 0.3945928996968958, + "grad_norm": 0.6972247175402578, + "learning_rate": 4.87402183978765e-07, + "loss": 1.3911, + "step": 5663 + }, + { + "epoch": 0.3946625788245131, + "grad_norm": 0.6724755543519445, + "learning_rate": 4.87336969984152e-07, + "loss": 1.553, + "step": 5664 + }, + { + "epoch": 0.39473225795213046, + "grad_norm": 0.7443101644041815, + "learning_rate": 4.872717510853411e-07, + "loss": 1.4836, + "step": 5665 + }, + { + "epoch": 0.39480193707974776, + "grad_norm": 0.7105185151933725, + "learning_rate": 4.872065272854581e-07, + "loss": 1.604, + "step": 5666 + }, + { + "epoch": 0.3948716162073651, + "grad_norm": 0.7224894163248697, + "learning_rate": 4.871412985876283e-07, + "loss": 1.5213, + "step": 5667 + }, + { + "epoch": 0.3949412953349824, + "grad_norm": 0.7109761043079036, + "learning_rate": 4.87076064994978e-07, + "loss": 1.5168, + "step": 5668 + }, + { + "epoch": 0.39501097446259975, + "grad_norm": 0.8068347274295018, + "learning_rate": 4.870108265106329e-07, + "loss": 1.7317, + "step": 5669 + }, + { + "epoch": 0.39508065359021705, + "grad_norm": 0.7051206301189206, + "learning_rate": 4.869455831377196e-07, + "loss": 1.5398, + "step": 5670 + }, + { + "epoch": 0.3951503327178344, + "grad_norm": 0.7227348729953125, + "learning_rate": 4.868803348793649e-07, + "loss": 1.5696, + "step": 5671 + }, + { + "epoch": 0.3952200118454517, + "grad_norm": 0.7597406001763358, + "learning_rate": 4.868150817386952e-07, + "loss": 1.4969, + "step": 5672 + }, + { + "epoch": 0.39528969097306904, + "grad_norm": 0.8029844792030686, + "learning_rate": 4.867498237188378e-07, + "loss": 1.5866, + "step": 5673 + }, + { + "epoch": 0.39535937010068634, + "grad_norm": 0.7330628123186286, + "learning_rate": 4.8668456082292e-07, + "loss": 1.5821, + "step": 5674 + }, + { + "epoch": 0.3954290492283037, + "grad_norm": 0.7313312062625753, + "learning_rate": 4.866192930540692e-07, + "loss": 1.6279, + "step": 5675 + }, + { + "epoch": 0.395498728355921, + "grad_norm": 0.8043239241069365, + "learning_rate": 4.865540204154133e-07, + "loss": 1.544, + "step": 5676 + }, + { + "epoch": 0.39556840748353833, + "grad_norm": 0.7187432972009813, + "learning_rate": 4.864887429100803e-07, + "loss": 1.4934, + "step": 5677 + }, + { + "epoch": 0.3956380866111556, + "grad_norm": 0.7673309697309961, + "learning_rate": 4.864234605411983e-07, + "loss": 1.5899, + "step": 5678 + }, + { + "epoch": 0.395707765738773, + "grad_norm": 0.7442189649830544, + "learning_rate": 4.86358173311896e-07, + "loss": 1.5012, + "step": 5679 + }, + { + "epoch": 0.39577744486639027, + "grad_norm": 0.7633515183291592, + "learning_rate": 4.862928812253018e-07, + "loss": 1.6551, + "step": 5680 + }, + { + "epoch": 0.3958471239940076, + "grad_norm": 0.6976117552063694, + "learning_rate": 4.862275842845448e-07, + "loss": 1.5045, + "step": 5681 + }, + { + "epoch": 0.3959168031216249, + "grad_norm": 0.7490951654378956, + "learning_rate": 4.861622824927543e-07, + "loss": 1.5752, + "step": 5682 + }, + { + "epoch": 0.39598648224924227, + "grad_norm": 0.6767327539185277, + "learning_rate": 4.860969758530593e-07, + "loss": 1.497, + "step": 5683 + }, + { + "epoch": 0.39605616137685956, + "grad_norm": 0.6841274638774705, + "learning_rate": 4.860316643685898e-07, + "loss": 1.5006, + "step": 5684 + }, + { + "epoch": 0.3961258405044769, + "grad_norm": 0.7178087195736019, + "learning_rate": 4.859663480424755e-07, + "loss": 1.6026, + "step": 5685 + }, + { + "epoch": 0.3961955196320942, + "grad_norm": 0.7031296552917827, + "learning_rate": 4.859010268778465e-07, + "loss": 1.6191, + "step": 5686 + }, + { + "epoch": 0.3962651987597115, + "grad_norm": 0.70942369444354, + "learning_rate": 4.858357008778333e-07, + "loss": 1.6875, + "step": 5687 + }, + { + "epoch": 0.39633487788732885, + "grad_norm": 0.7593496749721073, + "learning_rate": 4.857703700455662e-07, + "loss": 1.4996, + "step": 5688 + }, + { + "epoch": 0.39640455701494615, + "grad_norm": 0.7200832749609688, + "learning_rate": 4.85705034384176e-07, + "loss": 1.5551, + "step": 5689 + }, + { + "epoch": 0.3964742361425635, + "grad_norm": 0.6920762660051075, + "learning_rate": 4.856396938967939e-07, + "loss": 1.5067, + "step": 5690 + }, + { + "epoch": 0.3965439152701808, + "grad_norm": 0.7669411676412027, + "learning_rate": 4.855743485865511e-07, + "loss": 1.4865, + "step": 5691 + }, + { + "epoch": 0.39661359439779814, + "grad_norm": 0.7247852762545683, + "learning_rate": 4.85508998456579e-07, + "loss": 1.5194, + "step": 5692 + }, + { + "epoch": 0.39668327352541544, + "grad_norm": 0.6844608804865571, + "learning_rate": 4.854436435100093e-07, + "loss": 1.5579, + "step": 5693 + }, + { + "epoch": 0.3967529526530328, + "grad_norm": 0.8003618335097139, + "learning_rate": 4.85378283749974e-07, + "loss": 1.5606, + "step": 5694 + }, + { + "epoch": 0.3968226317806501, + "grad_norm": 0.6849811665507215, + "learning_rate": 4.853129191796053e-07, + "loss": 1.3802, + "step": 5695 + }, + { + "epoch": 0.39689231090826743, + "grad_norm": 0.6944625667383478, + "learning_rate": 4.852475498020355e-07, + "loss": 1.4661, + "step": 5696 + }, + { + "epoch": 0.3969619900358847, + "grad_norm": 0.7215389063076278, + "learning_rate": 4.851821756203975e-07, + "loss": 1.5935, + "step": 5697 + }, + { + "epoch": 0.3970316691635021, + "grad_norm": 0.7034503235896736, + "learning_rate": 4.851167966378238e-07, + "loss": 1.6066, + "step": 5698 + }, + { + "epoch": 0.39710134829111937, + "grad_norm": 0.7325183000161554, + "learning_rate": 4.850514128574478e-07, + "loss": 1.5436, + "step": 5699 + }, + { + "epoch": 0.3971710274187367, + "grad_norm": 0.7186466732509661, + "learning_rate": 4.849860242824026e-07, + "loss": 1.4569, + "step": 5700 + }, + { + "epoch": 0.397240706546354, + "grad_norm": 0.69751502384842, + "learning_rate": 4.84920630915822e-07, + "loss": 1.4414, + "step": 5701 + }, + { + "epoch": 0.39731038567397137, + "grad_norm": 0.7034771671737267, + "learning_rate": 4.848552327608393e-07, + "loss": 1.4511, + "step": 5702 + }, + { + "epoch": 0.39738006480158866, + "grad_norm": 0.7062965342234896, + "learning_rate": 4.847898298205892e-07, + "loss": 1.5185, + "step": 5703 + }, + { + "epoch": 0.397449743929206, + "grad_norm": 0.7433825843030339, + "learning_rate": 4.847244220982053e-07, + "loss": 1.5063, + "step": 5704 + }, + { + "epoch": 0.3975194230568233, + "grad_norm": 1.0650085692078823, + "learning_rate": 4.846590095968226e-07, + "loss": 1.5949, + "step": 5705 + }, + { + "epoch": 0.39758910218444066, + "grad_norm": 0.7328515507980331, + "learning_rate": 4.845935923195755e-07, + "loss": 1.543, + "step": 5706 + }, + { + "epoch": 0.39765878131205795, + "grad_norm": 0.7022710312029437, + "learning_rate": 4.845281702695989e-07, + "loss": 1.6136, + "step": 5707 + }, + { + "epoch": 0.3977284604396753, + "grad_norm": 0.712743673405011, + "learning_rate": 4.844627434500282e-07, + "loss": 1.4946, + "step": 5708 + }, + { + "epoch": 0.3977981395672926, + "grad_norm": 0.7208989189265302, + "learning_rate": 4.843973118639986e-07, + "loss": 1.5949, + "step": 5709 + }, + { + "epoch": 0.39786781869490995, + "grad_norm": 0.6990871375396196, + "learning_rate": 4.843318755146456e-07, + "loss": 1.5536, + "step": 5710 + }, + { + "epoch": 0.39793749782252724, + "grad_norm": 0.7238622642027562, + "learning_rate": 4.842664344051053e-07, + "loss": 1.4782, + "step": 5711 + }, + { + "epoch": 0.3980071769501446, + "grad_norm": 0.7130800444416632, + "learning_rate": 4.842009885385137e-07, + "loss": 1.4871, + "step": 5712 + }, + { + "epoch": 0.3980768560777619, + "grad_norm": 0.660704364407987, + "learning_rate": 4.841355379180071e-07, + "loss": 1.4377, + "step": 5713 + }, + { + "epoch": 0.39814653520537924, + "grad_norm": 0.7016372786912725, + "learning_rate": 4.840700825467219e-07, + "loss": 1.5465, + "step": 5714 + }, + { + "epoch": 0.39821621433299653, + "grad_norm": 0.8233394310985324, + "learning_rate": 4.84004622427795e-07, + "loss": 1.5596, + "step": 5715 + }, + { + "epoch": 0.3982858934606139, + "grad_norm": 0.7431768553667599, + "learning_rate": 4.839391575643634e-07, + "loss": 1.5784, + "step": 5716 + }, + { + "epoch": 0.3983555725882312, + "grad_norm": 0.7286515546228858, + "learning_rate": 4.838736879595643e-07, + "loss": 1.5407, + "step": 5717 + }, + { + "epoch": 0.3984252517158485, + "grad_norm": 0.6803185223474442, + "learning_rate": 4.838082136165349e-07, + "loss": 1.5144, + "step": 5718 + }, + { + "epoch": 0.3984949308434658, + "grad_norm": 0.729071303691758, + "learning_rate": 4.837427345384132e-07, + "loss": 1.4729, + "step": 5719 + }, + { + "epoch": 0.3985646099710832, + "grad_norm": 0.7505050057733356, + "learning_rate": 4.836772507283369e-07, + "loss": 1.6377, + "step": 5720 + }, + { + "epoch": 0.39863428909870047, + "grad_norm": 0.6596317365173326, + "learning_rate": 4.836117621894442e-07, + "loss": 1.4454, + "step": 5721 + }, + { + "epoch": 0.3987039682263178, + "grad_norm": 0.747012650690718, + "learning_rate": 4.835462689248733e-07, + "loss": 1.4649, + "step": 5722 + }, + { + "epoch": 0.3987736473539351, + "grad_norm": 0.7566351309737823, + "learning_rate": 4.83480770937763e-07, + "loss": 1.6035, + "step": 5723 + }, + { + "epoch": 0.39884332648155246, + "grad_norm": 0.7442740560150342, + "learning_rate": 4.83415268231252e-07, + "loss": 1.4965, + "step": 5724 + }, + { + "epoch": 0.39891300560916976, + "grad_norm": 0.7856195341062227, + "learning_rate": 4.833497608084793e-07, + "loss": 1.5433, + "step": 5725 + }, + { + "epoch": 0.3989826847367871, + "grad_norm": 0.69840242548124, + "learning_rate": 4.83284248672584e-07, + "loss": 1.5195, + "step": 5726 + }, + { + "epoch": 0.3990523638644044, + "grad_norm": 0.6809083026173022, + "learning_rate": 4.832187318267059e-07, + "loss": 1.4451, + "step": 5727 + }, + { + "epoch": 0.39912204299202175, + "grad_norm": 0.6750686878569807, + "learning_rate": 4.831532102739844e-07, + "loss": 1.5211, + "step": 5728 + }, + { + "epoch": 0.39919172211963905, + "grad_norm": 0.7334443597087285, + "learning_rate": 4.830876840175596e-07, + "loss": 1.6082, + "step": 5729 + }, + { + "epoch": 0.3992614012472564, + "grad_norm": 0.68852786930121, + "learning_rate": 4.830221530605716e-07, + "loss": 1.6109, + "step": 5730 + }, + { + "epoch": 0.3993310803748737, + "grad_norm": 0.7544369302712011, + "learning_rate": 4.829566174061609e-07, + "loss": 1.5038, + "step": 5731 + }, + { + "epoch": 0.39940075950249104, + "grad_norm": 0.7049817535947259, + "learning_rate": 4.828910770574679e-07, + "loss": 1.5423, + "step": 5732 + }, + { + "epoch": 0.39947043863010834, + "grad_norm": 0.7385275567962578, + "learning_rate": 4.828255320176336e-07, + "loss": 1.5723, + "step": 5733 + }, + { + "epoch": 0.3995401177577257, + "grad_norm": 0.759768274010817, + "learning_rate": 4.827599822897988e-07, + "loss": 1.5713, + "step": 5734 + }, + { + "epoch": 0.399609796885343, + "grad_norm": 0.8286752447322427, + "learning_rate": 4.826944278771051e-07, + "loss": 1.6021, + "step": 5735 + }, + { + "epoch": 0.39967947601296033, + "grad_norm": 0.7238465828951179, + "learning_rate": 4.826288687826939e-07, + "loss": 1.6626, + "step": 5736 + }, + { + "epoch": 0.39974915514057763, + "grad_norm": 0.7770751932847518, + "learning_rate": 4.825633050097067e-07, + "loss": 1.6797, + "step": 5737 + }, + { + "epoch": 0.399818834268195, + "grad_norm": 0.7711030533066128, + "learning_rate": 4.824977365612857e-07, + "loss": 1.6196, + "step": 5738 + }, + { + "epoch": 0.3998885133958123, + "grad_norm": 0.6875827946985658, + "learning_rate": 4.82432163440573e-07, + "loss": 1.5406, + "step": 5739 + }, + { + "epoch": 0.3999581925234296, + "grad_norm": 0.7335197989377081, + "learning_rate": 4.82366585650711e-07, + "loss": 1.6377, + "step": 5740 + }, + { + "epoch": 0.4000278716510469, + "grad_norm": 0.6941435526661424, + "learning_rate": 4.823010031948425e-07, + "loss": 1.6344, + "step": 5741 + }, + { + "epoch": 0.40009755077866427, + "grad_norm": 0.6797310835665473, + "learning_rate": 4.8223541607611e-07, + "loss": 1.4352, + "step": 5742 + } + ], + "logging_steps": 1, + "max_steps": 14351, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2871, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3010137962840064.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}