diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4809 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9992652461425422, + "eval_steps": 250, + "global_step": 680, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014695077149155032, + "grad_norm": 11.337424189328551, + "learning_rate": 4.7619047619047613e-08, + "loss": 1.5474, + "step": 1 + }, + { + "epoch": 0.0029390154298310064, + "grad_norm": 11.09462136805898, + "learning_rate": 9.523809523809523e-08, + "loss": 1.5705, + "step": 2 + }, + { + "epoch": 0.00440852314474651, + "grad_norm": 11.058506611133362, + "learning_rate": 1.4285714285714285e-07, + "loss": 1.4847, + "step": 3 + }, + { + "epoch": 0.005878030859662013, + "grad_norm": 10.9831239658005, + "learning_rate": 1.9047619047619045e-07, + "loss": 1.5536, + "step": 4 + }, + { + "epoch": 0.0073475385745775165, + "grad_norm": 10.8860240443791, + "learning_rate": 2.3809523809523806e-07, + "loss": 1.5523, + "step": 5 + }, + { + "epoch": 0.00881704628949302, + "grad_norm": 12.310500944697788, + "learning_rate": 2.857142857142857e-07, + "loss": 1.5543, + "step": 6 + }, + { + "epoch": 0.010286554004408524, + "grad_norm": 10.339190259363582, + "learning_rate": 3.333333333333333e-07, + "loss": 1.5789, + "step": 7 + }, + { + "epoch": 0.011756061719324026, + "grad_norm": 11.75096794982965, + "learning_rate": 3.809523809523809e-07, + "loss": 1.4619, + "step": 8 + }, + { + "epoch": 0.01322556943423953, + "grad_norm": 10.983629821806812, + "learning_rate": 4.285714285714285e-07, + "loss": 1.5461, + "step": 9 + }, + { + "epoch": 0.014695077149155033, + "grad_norm": 10.026453006105893, + "learning_rate": 4.761904761904761e-07, + "loss": 1.594, + "step": 10 + }, + { + "epoch": 0.016164584864070537, + "grad_norm": 9.2515708413017, + "learning_rate": 5.238095238095238e-07, + "loss": 1.4904, + "step": 11 + }, + { + "epoch": 0.01763409257898604, + "grad_norm": 8.8513471084721, + "learning_rate": 5.714285714285714e-07, + "loss": 1.5626, + "step": 12 + }, + { + "epoch": 0.019103600293901544, + "grad_norm": 9.013166716897066, + "learning_rate": 6.19047619047619e-07, + "loss": 1.5522, + "step": 13 + }, + { + "epoch": 0.020573108008817047, + "grad_norm": 8.149687442441557, + "learning_rate": 6.666666666666666e-07, + "loss": 1.5075, + "step": 14 + }, + { + "epoch": 0.02204261572373255, + "grad_norm": 6.880717564693127, + "learning_rate": 7.142857142857143e-07, + "loss": 1.538, + "step": 15 + }, + { + "epoch": 0.02351212343864805, + "grad_norm": 6.545463429781402, + "learning_rate": 7.619047619047618e-07, + "loss": 1.5044, + "step": 16 + }, + { + "epoch": 0.024981631153563555, + "grad_norm": 6.093488294374897, + "learning_rate": 8.095238095238095e-07, + "loss": 1.4779, + "step": 17 + }, + { + "epoch": 0.02645113886847906, + "grad_norm": 6.4020806034329425, + "learning_rate": 8.57142857142857e-07, + "loss": 1.4431, + "step": 18 + }, + { + "epoch": 0.027920646583394562, + "grad_norm": 6.142908310782321, + "learning_rate": 9.047619047619047e-07, + "loss": 1.5965, + "step": 19 + }, + { + "epoch": 0.029390154298310066, + "grad_norm": 5.136583641076265, + "learning_rate": 9.523809523809522e-07, + "loss": 1.3968, + "step": 20 + }, + { + "epoch": 0.03085966201322557, + "grad_norm": 5.02265599787556, + "learning_rate": 1e-06, + "loss": 1.4007, + "step": 21 + }, + { + "epoch": 0.03232916972814107, + "grad_norm": 4.976860924734495, + "learning_rate": 9.999943184333936e-07, + "loss": 1.4378, + "step": 22 + }, + { + "epoch": 0.03379867744305658, + "grad_norm": 4.89918762254008, + "learning_rate": 9.999772738626954e-07, + "loss": 1.4341, + "step": 23 + }, + { + "epoch": 0.03526818515797208, + "grad_norm": 4.659631180053279, + "learning_rate": 9.999488666752648e-07, + "loss": 1.4178, + "step": 24 + }, + { + "epoch": 0.036737692872887584, + "grad_norm": 4.542979691946772, + "learning_rate": 9.99909097516691e-07, + "loss": 1.4933, + "step": 25 + }, + { + "epoch": 0.03820720058780309, + "grad_norm": 4.342835973040556, + "learning_rate": 9.998579672907788e-07, + "loss": 1.4417, + "step": 26 + }, + { + "epoch": 0.03967670830271859, + "grad_norm": 3.2871765127170853, + "learning_rate": 9.99795477159527e-07, + "loss": 1.423, + "step": 27 + }, + { + "epoch": 0.041146216017634095, + "grad_norm": 3.2008427389003766, + "learning_rate": 9.99721628543103e-07, + "loss": 1.3282, + "step": 28 + }, + { + "epoch": 0.0426157237325496, + "grad_norm": 3.1114560975594836, + "learning_rate": 9.996364231198103e-07, + "loss": 1.3891, + "step": 29 + }, + { + "epoch": 0.0440852314474651, + "grad_norm": 2.9307530666626884, + "learning_rate": 9.9953986282605e-07, + "loss": 1.3589, + "step": 30 + }, + { + "epoch": 0.045554739162380606, + "grad_norm": 2.9544258327932, + "learning_rate": 9.99431949856277e-07, + "loss": 1.3487, + "step": 31 + }, + { + "epoch": 0.0470242468772961, + "grad_norm": 2.9071124300801565, + "learning_rate": 9.9931268666295e-07, + "loss": 1.3462, + "step": 32 + }, + { + "epoch": 0.048493754592211606, + "grad_norm": 2.673673317642893, + "learning_rate": 9.991820759564766e-07, + "loss": 1.3972, + "step": 33 + }, + { + "epoch": 0.04996326230712711, + "grad_norm": 2.746698468930808, + "learning_rate": 9.990401207051504e-07, + "loss": 1.2998, + "step": 34 + }, + { + "epoch": 0.05143277002204261, + "grad_norm": 2.699798149588662, + "learning_rate": 9.98886824135084e-07, + "loss": 1.3451, + "step": 35 + }, + { + "epoch": 0.05290227773695812, + "grad_norm": 2.7302270842982987, + "learning_rate": 9.98722189730136e-07, + "loss": 1.392, + "step": 36 + }, + { + "epoch": 0.05437178545187362, + "grad_norm": 2.535544436400141, + "learning_rate": 9.985462212318322e-07, + "loss": 1.3673, + "step": 37 + }, + { + "epoch": 0.055841293166789124, + "grad_norm": 2.5536809032653145, + "learning_rate": 9.983589226392792e-07, + "loss": 1.2696, + "step": 38 + }, + { + "epoch": 0.05731080088170463, + "grad_norm": 2.438242871155347, + "learning_rate": 9.98160298209075e-07, + "loss": 1.3818, + "step": 39 + }, + { + "epoch": 0.05878030859662013, + "grad_norm": 2.2111464394984814, + "learning_rate": 9.97950352455211e-07, + "loss": 1.2618, + "step": 40 + }, + { + "epoch": 0.060249816311535635, + "grad_norm": 2.0951995475252856, + "learning_rate": 9.977290901489707e-07, + "loss": 1.2963, + "step": 41 + }, + { + "epoch": 0.06171932402645114, + "grad_norm": 2.1449537100989846, + "learning_rate": 9.9749651631882e-07, + "loss": 1.2525, + "step": 42 + }, + { + "epoch": 0.06318883174136664, + "grad_norm": 2.617528852412968, + "learning_rate": 9.972526362502937e-07, + "loss": 1.3144, + "step": 43 + }, + { + "epoch": 0.06465833945628215, + "grad_norm": 2.210234169368085, + "learning_rate": 9.969974554858754e-07, + "loss": 1.2379, + "step": 44 + }, + { + "epoch": 0.06612784717119764, + "grad_norm": 2.42457485573731, + "learning_rate": 9.967309798248707e-07, + "loss": 1.2573, + "step": 45 + }, + { + "epoch": 0.06759735488611315, + "grad_norm": 2.0254014234209405, + "learning_rate": 9.96453215323277e-07, + "loss": 1.2536, + "step": 46 + }, + { + "epoch": 0.06906686260102865, + "grad_norm": 2.0725386643920958, + "learning_rate": 9.961641682936442e-07, + "loss": 1.2687, + "step": 47 + }, + { + "epoch": 0.07053637031594416, + "grad_norm": 1.8563249903888095, + "learning_rate": 9.95863845304932e-07, + "loss": 1.2335, + "step": 48 + }, + { + "epoch": 0.07200587803085966, + "grad_norm": 1.9548577428039762, + "learning_rate": 9.955522531823606e-07, + "loss": 1.3056, + "step": 49 + }, + { + "epoch": 0.07347538574577517, + "grad_norm": 1.855842269546498, + "learning_rate": 9.952293990072557e-07, + "loss": 1.2671, + "step": 50 + }, + { + "epoch": 0.07494489346069066, + "grad_norm": 1.7972312223565392, + "learning_rate": 9.948952901168874e-07, + "loss": 1.2718, + "step": 51 + }, + { + "epoch": 0.07641440117560618, + "grad_norm": 1.6423712015458758, + "learning_rate": 9.945499341043033e-07, + "loss": 1.2043, + "step": 52 + }, + { + "epoch": 0.07788390889052167, + "grad_norm": 1.6028195930568658, + "learning_rate": 9.94193338818156e-07, + "loss": 1.2136, + "step": 53 + }, + { + "epoch": 0.07935341660543718, + "grad_norm": 1.5486505262354593, + "learning_rate": 9.938255123625251e-07, + "loss": 1.2556, + "step": 54 + }, + { + "epoch": 0.08082292432035268, + "grad_norm": 1.5343630961919048, + "learning_rate": 9.934464630967328e-07, + "loss": 1.2519, + "step": 55 + }, + { + "epoch": 0.08229243203526819, + "grad_norm": 1.662816938451208, + "learning_rate": 9.930561996351533e-07, + "loss": 1.2482, + "step": 56 + }, + { + "epoch": 0.08376193975018369, + "grad_norm": 1.5931246608007381, + "learning_rate": 9.926547308470183e-07, + "loss": 1.1625, + "step": 57 + }, + { + "epoch": 0.0852314474650992, + "grad_norm": 1.5447707955729764, + "learning_rate": 9.922420658562144e-07, + "loss": 1.1873, + "step": 58 + }, + { + "epoch": 0.0867009551800147, + "grad_norm": 1.644709907245824, + "learning_rate": 9.91818214041076e-07, + "loss": 1.1914, + "step": 59 + }, + { + "epoch": 0.0881704628949302, + "grad_norm": 1.6366535743931827, + "learning_rate": 9.913831850341725e-07, + "loss": 1.1871, + "step": 60 + }, + { + "epoch": 0.0896399706098457, + "grad_norm": 1.5571790849844254, + "learning_rate": 9.90936988722089e-07, + "loss": 1.2605, + "step": 61 + }, + { + "epoch": 0.09110947832476121, + "grad_norm": 3.7097540672301386, + "learning_rate": 9.904796352452019e-07, + "loss": 1.1212, + "step": 62 + }, + { + "epoch": 0.09257898603967671, + "grad_norm": 1.5517830620847208, + "learning_rate": 9.900111349974478e-07, + "loss": 1.1623, + "step": 63 + }, + { + "epoch": 0.0940484937545922, + "grad_norm": 1.5680093555121737, + "learning_rate": 9.895314986260886e-07, + "loss": 1.1552, + "step": 64 + }, + { + "epoch": 0.09551800146950772, + "grad_norm": 1.6209106318425315, + "learning_rate": 9.890407370314677e-07, + "loss": 1.1337, + "step": 65 + }, + { + "epoch": 0.09698750918442321, + "grad_norm": 1.4100673270240494, + "learning_rate": 9.885388613667644e-07, + "loss": 1.2156, + "step": 66 + }, + { + "epoch": 0.09845701689933872, + "grad_norm": 1.5012099159985126, + "learning_rate": 9.880258830377386e-07, + "loss": 1.2427, + "step": 67 + }, + { + "epoch": 0.09992652461425422, + "grad_norm": 1.5660607972935188, + "learning_rate": 9.875018137024721e-07, + "loss": 1.1729, + "step": 68 + }, + { + "epoch": 0.10139603232916973, + "grad_norm": 1.5666357672603266, + "learning_rate": 9.869666652711049e-07, + "loss": 1.2838, + "step": 69 + }, + { + "epoch": 0.10286554004408523, + "grad_norm": 1.5178831815539953, + "learning_rate": 9.864204499055622e-07, + "loss": 1.2145, + "step": 70 + }, + { + "epoch": 0.10433504775900074, + "grad_norm": 1.71647656199295, + "learning_rate": 9.858631800192804e-07, + "loss": 1.1724, + "step": 71 + }, + { + "epoch": 0.10580455547391623, + "grad_norm": 1.5138224523065718, + "learning_rate": 9.852948682769234e-07, + "loss": 1.2672, + "step": 72 + }, + { + "epoch": 0.10727406318883174, + "grad_norm": 1.5726438604764952, + "learning_rate": 9.84715527594095e-07, + "loss": 1.1505, + "step": 73 + }, + { + "epoch": 0.10874357090374724, + "grad_norm": 1.5196898825337892, + "learning_rate": 9.841251711370457e-07, + "loss": 1.1801, + "step": 74 + }, + { + "epoch": 0.11021307861866275, + "grad_norm": 1.5385492133663783, + "learning_rate": 9.83523812322374e-07, + "loss": 1.2523, + "step": 75 + }, + { + "epoch": 0.11168258633357825, + "grad_norm": 1.4506696813615474, + "learning_rate": 9.829114648167206e-07, + "loss": 1.1798, + "step": 76 + }, + { + "epoch": 0.11315209404849376, + "grad_norm": 1.5471986855965765, + "learning_rate": 9.822881425364578e-07, + "loss": 1.1227, + "step": 77 + }, + { + "epoch": 0.11462160176340926, + "grad_norm": 1.4810764356218553, + "learning_rate": 9.81653859647374e-07, + "loss": 1.1904, + "step": 78 + }, + { + "epoch": 0.11609110947832477, + "grad_norm": 1.458422270393885, + "learning_rate": 9.810086305643511e-07, + "loss": 1.1688, + "step": 79 + }, + { + "epoch": 0.11756061719324026, + "grad_norm": 1.4150019691387836, + "learning_rate": 9.803524699510372e-07, + "loss": 1.1679, + "step": 80 + }, + { + "epoch": 0.11903012490815577, + "grad_norm": 1.5594916312404665, + "learning_rate": 9.79685392719513e-07, + "loss": 1.1915, + "step": 81 + }, + { + "epoch": 0.12049963262307127, + "grad_norm": 1.5378772800136011, + "learning_rate": 9.790074140299535e-07, + "loss": 1.2446, + "step": 82 + }, + { + "epoch": 0.12196914033798678, + "grad_norm": 1.3745260542581244, + "learning_rate": 9.783185492902831e-07, + "loss": 1.1894, + "step": 83 + }, + { + "epoch": 0.12343864805290228, + "grad_norm": 1.4281455021507763, + "learning_rate": 9.776188141558253e-07, + "loss": 1.1535, + "step": 84 + }, + { + "epoch": 0.12490815576781777, + "grad_norm": 1.4827991467262824, + "learning_rate": 9.769082245289472e-07, + "loss": 1.1812, + "step": 85 + }, + { + "epoch": 0.12637766348273327, + "grad_norm": 1.4091217681303907, + "learning_rate": 9.76186796558698e-07, + "loss": 1.1733, + "step": 86 + }, + { + "epoch": 0.1278471711976488, + "grad_norm": 1.504340633323682, + "learning_rate": 9.754545466404423e-07, + "loss": 1.1891, + "step": 87 + }, + { + "epoch": 0.1293166789125643, + "grad_norm": 1.5341255256297364, + "learning_rate": 9.747114914154862e-07, + "loss": 1.1941, + "step": 88 + }, + { + "epoch": 0.1307861866274798, + "grad_norm": 1.448718524801724, + "learning_rate": 9.73957647770701e-07, + "loss": 1.2118, + "step": 89 + }, + { + "epoch": 0.13225569434239529, + "grad_norm": 1.4171871543912984, + "learning_rate": 9.731930328381384e-07, + "loss": 1.1745, + "step": 90 + }, + { + "epoch": 0.1337252020573108, + "grad_norm": 1.487952395310904, + "learning_rate": 9.72417663994641e-07, + "loss": 1.1625, + "step": 91 + }, + { + "epoch": 0.1351947097722263, + "grad_norm": 1.5914017741584792, + "learning_rate": 9.716315588614472e-07, + "loss": 1.1051, + "step": 92 + }, + { + "epoch": 0.1366642174871418, + "grad_norm": 1.6391622049937462, + "learning_rate": 9.708347353037924e-07, + "loss": 1.143, + "step": 93 + }, + { + "epoch": 0.1381337252020573, + "grad_norm": 1.4437878891336424, + "learning_rate": 9.700272114305008e-07, + "loss": 1.1648, + "step": 94 + }, + { + "epoch": 0.13960323291697282, + "grad_norm": 1.4177630579747587, + "learning_rate": 9.69209005593575e-07, + "loss": 1.1999, + "step": 95 + }, + { + "epoch": 0.14107274063188832, + "grad_norm": 1.5120811492994586, + "learning_rate": 9.68380136387779e-07, + "loss": 1.2242, + "step": 96 + }, + { + "epoch": 0.14254224834680382, + "grad_norm": 1.4738631960216506, + "learning_rate": 9.67540622650215e-07, + "loss": 1.1548, + "step": 97 + }, + { + "epoch": 0.14401175606171931, + "grad_norm": 1.5620452244313792, + "learning_rate": 9.66690483459896e-07, + "loss": 1.1875, + "step": 98 + }, + { + "epoch": 0.14548126377663484, + "grad_norm": 1.5673649210031055, + "learning_rate": 9.658297381373117e-07, + "loss": 1.198, + "step": 99 + }, + { + "epoch": 0.14695077149155034, + "grad_norm": 1.4854078674557176, + "learning_rate": 9.649584062439897e-07, + "loss": 1.2242, + "step": 100 + }, + { + "epoch": 0.14842027920646583, + "grad_norm": 1.5887802038965162, + "learning_rate": 9.640765075820508e-07, + "loss": 1.23, + "step": 101 + }, + { + "epoch": 0.14988978692138133, + "grad_norm": 1.5956321149777724, + "learning_rate": 9.631840621937585e-07, + "loss": 1.226, + "step": 102 + }, + { + "epoch": 0.15135929463629685, + "grad_norm": 1.4446949899977264, + "learning_rate": 9.622810903610653e-07, + "loss": 1.1796, + "step": 103 + }, + { + "epoch": 0.15282880235121235, + "grad_norm": 1.4023788130375525, + "learning_rate": 9.613676126051488e-07, + "loss": 1.1589, + "step": 104 + }, + { + "epoch": 0.15429831006612785, + "grad_norm": 1.5491829383620404, + "learning_rate": 9.604436496859482e-07, + "loss": 1.237, + "step": 105 + }, + { + "epoch": 0.15576781778104334, + "grad_norm": 1.3527676295882165, + "learning_rate": 9.595092226016912e-07, + "loss": 1.1329, + "step": 106 + }, + { + "epoch": 0.15723732549595884, + "grad_norm": 1.5174098623642012, + "learning_rate": 9.585643525884163e-07, + "loss": 1.179, + "step": 107 + }, + { + "epoch": 0.15870683321087437, + "grad_norm": 1.3787786506819375, + "learning_rate": 9.576090611194915e-07, + "loss": 1.1199, + "step": 108 + }, + { + "epoch": 0.16017634092578986, + "grad_norm": 1.4679013393236473, + "learning_rate": 9.566433699051248e-07, + "loss": 1.2768, + "step": 109 + }, + { + "epoch": 0.16164584864070536, + "grad_norm": 1.4363459788248092, + "learning_rate": 9.556673008918725e-07, + "loss": 1.1434, + "step": 110 + }, + { + "epoch": 0.16311535635562086, + "grad_norm": 1.5373366464111682, + "learning_rate": 9.546808762621385e-07, + "loss": 1.1391, + "step": 111 + }, + { + "epoch": 0.16458486407053638, + "grad_norm": 1.397660653092303, + "learning_rate": 9.536841184336725e-07, + "loss": 1.1599, + "step": 112 + }, + { + "epoch": 0.16605437178545188, + "grad_norm": 1.5338825854004605, + "learning_rate": 9.526770500590576e-07, + "loss": 1.0599, + "step": 113 + }, + { + "epoch": 0.16752387950036737, + "grad_norm": 1.7726346336471452, + "learning_rate": 9.516596940251986e-07, + "loss": 1.1347, + "step": 114 + }, + { + "epoch": 0.16899338721528287, + "grad_norm": 1.401274657699429, + "learning_rate": 9.506320734527997e-07, + "loss": 1.1538, + "step": 115 + }, + { + "epoch": 0.1704628949301984, + "grad_norm": 1.4313194549405384, + "learning_rate": 9.495942116958395e-07, + "loss": 1.2409, + "step": 116 + }, + { + "epoch": 0.1719324026451139, + "grad_norm": 1.338422913627801, + "learning_rate": 9.485461323410411e-07, + "loss": 1.2274, + "step": 117 + }, + { + "epoch": 0.1734019103600294, + "grad_norm": 1.414029707516539, + "learning_rate": 9.474878592073352e-07, + "loss": 1.1813, + "step": 118 + }, + { + "epoch": 0.17487141807494488, + "grad_norm": 1.5090772581920886, + "learning_rate": 9.464194163453188e-07, + "loss": 1.1576, + "step": 119 + }, + { + "epoch": 0.1763409257898604, + "grad_norm": 1.397279646873424, + "learning_rate": 9.45340828036709e-07, + "loss": 1.1172, + "step": 120 + }, + { + "epoch": 0.1778104335047759, + "grad_norm": 1.4335773048462084, + "learning_rate": 9.442521187937911e-07, + "loss": 1.2413, + "step": 121 + }, + { + "epoch": 0.1792799412196914, + "grad_norm": 1.4697864318337606, + "learning_rate": 9.431533133588616e-07, + "loss": 1.1551, + "step": 122 + }, + { + "epoch": 0.1807494489346069, + "grad_norm": 1.418392328936289, + "learning_rate": 9.420444367036653e-07, + "loss": 1.1411, + "step": 123 + }, + { + "epoch": 0.18221895664952242, + "grad_norm": 1.3863098352226801, + "learning_rate": 9.409255140288288e-07, + "loss": 1.1388, + "step": 124 + }, + { + "epoch": 0.18368846436443792, + "grad_norm": 1.4189204374991593, + "learning_rate": 9.397965707632866e-07, + "loss": 1.1512, + "step": 125 + }, + { + "epoch": 0.18515797207935342, + "grad_norm": 1.524052045769854, + "learning_rate": 9.386576325637043e-07, + "loss": 1.1759, + "step": 126 + }, + { + "epoch": 0.1866274797942689, + "grad_norm": 1.405091479549588, + "learning_rate": 9.375087253138951e-07, + "loss": 1.166, + "step": 127 + }, + { + "epoch": 0.1880969875091844, + "grad_norm": 1.544876861673977, + "learning_rate": 9.363498751242307e-07, + "loss": 1.1309, + "step": 128 + }, + { + "epoch": 0.18956649522409993, + "grad_norm": 1.423691492612366, + "learning_rate": 9.351811083310497e-07, + "loss": 1.1394, + "step": 129 + }, + { + "epoch": 0.19103600293901543, + "grad_norm": 1.466545968863769, + "learning_rate": 9.340024514960574e-07, + "loss": 1.1772, + "step": 130 + }, + { + "epoch": 0.19250551065393093, + "grad_norm": 1.4099417566779877, + "learning_rate": 9.328139314057233e-07, + "loss": 1.116, + "step": 131 + }, + { + "epoch": 0.19397501836884642, + "grad_norm": 1.3751112771189664, + "learning_rate": 9.316155750706713e-07, + "loss": 1.1249, + "step": 132 + }, + { + "epoch": 0.19544452608376195, + "grad_norm": 1.440365523396416, + "learning_rate": 9.304074097250671e-07, + "loss": 1.1607, + "step": 133 + }, + { + "epoch": 0.19691403379867745, + "grad_norm": 1.3429807064760693, + "learning_rate": 9.291894628259979e-07, + "loss": 1.148, + "step": 134 + }, + { + "epoch": 0.19838354151359294, + "grad_norm": 1.6100321463659257, + "learning_rate": 9.279617620528496e-07, + "loss": 1.17, + "step": 135 + }, + { + "epoch": 0.19985304922850844, + "grad_norm": 1.5750772736599945, + "learning_rate": 9.26724335306677e-07, + "loss": 1.1625, + "step": 136 + }, + { + "epoch": 0.20132255694342396, + "grad_norm": 1.4373009264887187, + "learning_rate": 9.254772107095701e-07, + "loss": 1.1595, + "step": 137 + }, + { + "epoch": 0.20279206465833946, + "grad_norm": 1.3522430478527927, + "learning_rate": 9.242204166040148e-07, + "loss": 1.105, + "step": 138 + }, + { + "epoch": 0.20426157237325496, + "grad_norm": 1.496220468060292, + "learning_rate": 9.229539815522485e-07, + "loss": 1.1729, + "step": 139 + }, + { + "epoch": 0.20573108008817045, + "grad_norm": 1.3692383152400416, + "learning_rate": 9.216779343356119e-07, + "loss": 1.0892, + "step": 140 + }, + { + "epoch": 0.20720058780308598, + "grad_norm": 1.415058934382544, + "learning_rate": 9.203923039538939e-07, + "loss": 1.098, + "step": 141 + }, + { + "epoch": 0.20867009551800147, + "grad_norm": 1.3906133545085528, + "learning_rate": 9.190971196246731e-07, + "loss": 1.1862, + "step": 142 + }, + { + "epoch": 0.21013960323291697, + "grad_norm": 1.4853242615363982, + "learning_rate": 9.177924107826535e-07, + "loss": 1.125, + "step": 143 + }, + { + "epoch": 0.21160911094783247, + "grad_norm": 1.4673768423852198, + "learning_rate": 9.164782070789961e-07, + "loss": 1.2024, + "step": 144 + }, + { + "epoch": 0.213078618662748, + "grad_norm": 1.357361299564151, + "learning_rate": 9.151545383806441e-07, + "loss": 1.0744, + "step": 145 + }, + { + "epoch": 0.2145481263776635, + "grad_norm": 1.4090842422695524, + "learning_rate": 9.138214347696453e-07, + "loss": 1.2193, + "step": 146 + }, + { + "epoch": 0.216017634092579, + "grad_norm": 1.403649440383755, + "learning_rate": 9.124789265424674e-07, + "loss": 1.1808, + "step": 147 + }, + { + "epoch": 0.21748714180749448, + "grad_norm": 1.3566301062411306, + "learning_rate": 9.1112704420931e-07, + "loss": 1.1419, + "step": 148 + }, + { + "epoch": 0.21895664952240998, + "grad_norm": 1.3695324273141942, + "learning_rate": 9.097658184934114e-07, + "loss": 1.1495, + "step": 149 + }, + { + "epoch": 0.2204261572373255, + "grad_norm": 1.4138005238611118, + "learning_rate": 9.083952803303496e-07, + "loss": 1.2471, + "step": 150 + }, + { + "epoch": 0.221895664952241, + "grad_norm": 1.3414103194314155, + "learning_rate": 9.070154608673402e-07, + "loss": 1.1672, + "step": 151 + }, + { + "epoch": 0.2233651726671565, + "grad_norm": 1.438725688375383, + "learning_rate": 9.056263914625277e-07, + "loss": 1.2144, + "step": 152 + }, + { + "epoch": 0.224834680382072, + "grad_norm": 1.564966532077944, + "learning_rate": 9.042281036842739e-07, + "loss": 1.1981, + "step": 153 + }, + { + "epoch": 0.22630418809698752, + "grad_norm": 1.275239691393889, + "learning_rate": 9.028206293104391e-07, + "loss": 1.1098, + "step": 154 + }, + { + "epoch": 0.22777369581190302, + "grad_norm": 1.4419905057143438, + "learning_rate": 9.014040003276611e-07, + "loss": 1.1783, + "step": 155 + }, + { + "epoch": 0.2292432035268185, + "grad_norm": 1.4105436779658675, + "learning_rate": 8.999782489306271e-07, + "loss": 1.1513, + "step": 156 + }, + { + "epoch": 0.230712711241734, + "grad_norm": 1.4197672181122762, + "learning_rate": 8.985434075213439e-07, + "loss": 1.1433, + "step": 157 + }, + { + "epoch": 0.23218221895664953, + "grad_norm": 1.3670907633233818, + "learning_rate": 8.970995087083992e-07, + "loss": 1.03, + "step": 158 + }, + { + "epoch": 0.23365172667156503, + "grad_norm": 1.3603227901129349, + "learning_rate": 8.956465853062222e-07, + "loss": 1.0851, + "step": 159 + }, + { + "epoch": 0.23512123438648053, + "grad_norm": 1.3852573758765583, + "learning_rate": 8.941846703343372e-07, + "loss": 1.2005, + "step": 160 + }, + { + "epoch": 0.23659074210139602, + "grad_norm": 1.3638595388577004, + "learning_rate": 8.927137970166135e-07, + "loss": 1.1217, + "step": 161 + }, + { + "epoch": 0.23806024981631155, + "grad_norm": 1.4806310433707142, + "learning_rate": 8.912339987805099e-07, + "loss": 1.1339, + "step": 162 + }, + { + "epoch": 0.23952975753122704, + "grad_norm": 1.4400386045554132, + "learning_rate": 8.897453092563153e-07, + "loss": 1.1343, + "step": 163 + }, + { + "epoch": 0.24099926524614254, + "grad_norm": 1.3697227632539914, + "learning_rate": 8.882477622763846e-07, + "loss": 1.1529, + "step": 164 + }, + { + "epoch": 0.24246877296105804, + "grad_norm": 1.3503406924429706, + "learning_rate": 8.867413918743693e-07, + "loss": 1.164, + "step": 165 + }, + { + "epoch": 0.24393828067597356, + "grad_norm": 1.3850072048890962, + "learning_rate": 8.852262322844444e-07, + "loss": 1.153, + "step": 166 + }, + { + "epoch": 0.24540778839088906, + "grad_norm": 1.359563698104864, + "learning_rate": 8.837023179405308e-07, + "loss": 1.1724, + "step": 167 + }, + { + "epoch": 0.24687729610580456, + "grad_norm": 1.3978149085145892, + "learning_rate": 8.821696834755117e-07, + "loss": 1.0965, + "step": 168 + }, + { + "epoch": 0.24834680382072005, + "grad_norm": 1.404265065477685, + "learning_rate": 8.806283637204462e-07, + "loss": 1.1879, + "step": 169 + }, + { + "epoch": 0.24981631153563555, + "grad_norm": 1.403937125406496, + "learning_rate": 8.790783937037776e-07, + "loss": 1.1251, + "step": 170 + }, + { + "epoch": 0.25128581925055105, + "grad_norm": 1.4373795084844492, + "learning_rate": 8.775198086505375e-07, + "loss": 1.1347, + "step": 171 + }, + { + "epoch": 0.25275532696546654, + "grad_norm": 1.4706572304655665, + "learning_rate": 8.759526439815455e-07, + "loss": 1.0477, + "step": 172 + }, + { + "epoch": 0.2542248346803821, + "grad_norm": 1.4008092483314147, + "learning_rate": 8.743769353126029e-07, + "loss": 1.1761, + "step": 173 + }, + { + "epoch": 0.2556943423952976, + "grad_norm": 1.3764920528768414, + "learning_rate": 8.727927184536849e-07, + "loss": 1.1277, + "step": 174 + }, + { + "epoch": 0.2571638501102131, + "grad_norm": 1.4213614638860257, + "learning_rate": 8.712000294081259e-07, + "loss": 1.0911, + "step": 175 + }, + { + "epoch": 0.2586333578251286, + "grad_norm": 1.3408094274422444, + "learning_rate": 8.695989043718015e-07, + "loss": 1.1602, + "step": 176 + }, + { + "epoch": 0.2601028655400441, + "grad_norm": 1.4025428044724146, + "learning_rate": 8.679893797323058e-07, + "loss": 1.1325, + "step": 177 + }, + { + "epoch": 0.2615723732549596, + "grad_norm": 1.3560427968027609, + "learning_rate": 8.663714920681245e-07, + "loss": 1.1789, + "step": 178 + }, + { + "epoch": 0.2630418809698751, + "grad_norm": 1.4997523461667033, + "learning_rate": 8.64745278147804e-07, + "loss": 1.1578, + "step": 179 + }, + { + "epoch": 0.26451138868479057, + "grad_norm": 1.4091572819165725, + "learning_rate": 8.631107749291148e-07, + "loss": 1.1344, + "step": 180 + }, + { + "epoch": 0.2659808963997061, + "grad_norm": 1.5029746752542426, + "learning_rate": 8.614680195582127e-07, + "loss": 1.1829, + "step": 181 + }, + { + "epoch": 0.2674504041146216, + "grad_norm": 1.4231917336486979, + "learning_rate": 8.598170493687939e-07, + "loss": 1.097, + "step": 182 + }, + { + "epoch": 0.2689199118295371, + "grad_norm": 1.3738447004488614, + "learning_rate": 8.581579018812468e-07, + "loss": 1.1539, + "step": 183 + }, + { + "epoch": 0.2703894195444526, + "grad_norm": 1.4706711617380483, + "learning_rate": 8.564906148017992e-07, + "loss": 1.179, + "step": 184 + }, + { + "epoch": 0.2718589272593681, + "grad_norm": 1.3788046176437054, + "learning_rate": 8.548152260216613e-07, + "loss": 1.1003, + "step": 185 + }, + { + "epoch": 0.2733284349742836, + "grad_norm": 1.3707247735299555, + "learning_rate": 8.531317736161652e-07, + "loss": 1.0604, + "step": 186 + }, + { + "epoch": 0.2747979426891991, + "grad_norm": 1.3354127082342266, + "learning_rate": 8.514402958438987e-07, + "loss": 1.1121, + "step": 187 + }, + { + "epoch": 0.2762674504041146, + "grad_norm": 1.336615144205344, + "learning_rate": 8.497408311458362e-07, + "loss": 1.0885, + "step": 188 + }, + { + "epoch": 0.2777369581190301, + "grad_norm": 1.3645293415775273, + "learning_rate": 8.480334181444652e-07, + "loss": 1.1334, + "step": 189 + }, + { + "epoch": 0.27920646583394565, + "grad_norm": 1.405142088376881, + "learning_rate": 8.463180956429085e-07, + "loss": 1.1685, + "step": 190 + }, + { + "epoch": 0.28067597354886115, + "grad_norm": 1.3868866326557587, + "learning_rate": 8.445949026240424e-07, + "loss": 1.2174, + "step": 191 + }, + { + "epoch": 0.28214548126377664, + "grad_norm": 1.335338615058983, + "learning_rate": 8.428638782496105e-07, + "loss": 1.1235, + "step": 192 + }, + { + "epoch": 0.28361498897869214, + "grad_norm": 1.3713807109243261, + "learning_rate": 8.411250618593337e-07, + "loss": 1.2191, + "step": 193 + }, + { + "epoch": 0.28508449669360764, + "grad_norm": 1.41726090350252, + "learning_rate": 8.393784929700169e-07, + "loss": 1.1522, + "step": 194 + }, + { + "epoch": 0.28655400440852313, + "grad_norm": 1.3529238461119495, + "learning_rate": 8.376242112746499e-07, + "loss": 1.0376, + "step": 195 + }, + { + "epoch": 0.28802351212343863, + "grad_norm": 1.4100766083722753, + "learning_rate": 8.358622566415057e-07, + "loss": 1.0645, + "step": 196 + }, + { + "epoch": 0.2894930198383541, + "grad_norm": 1.3130064224637852, + "learning_rate": 8.340926691132348e-07, + "loss": 1.1029, + "step": 197 + }, + { + "epoch": 0.2909625275532697, + "grad_norm": 1.3572153763312709, + "learning_rate": 8.323154889059549e-07, + "loss": 1.1309, + "step": 198 + }, + { + "epoch": 0.2924320352681852, + "grad_norm": 1.542624539487457, + "learning_rate": 8.305307564083368e-07, + "loss": 1.2663, + "step": 199 + }, + { + "epoch": 0.29390154298310067, + "grad_norm": 1.3525660807398292, + "learning_rate": 8.287385121806869e-07, + "loss": 1.1509, + "step": 200 + }, + { + "epoch": 0.29537105069801617, + "grad_norm": 1.514167322393567, + "learning_rate": 8.26938796954025e-07, + "loss": 1.233, + "step": 201 + }, + { + "epoch": 0.29684055841293167, + "grad_norm": 1.3922047754873006, + "learning_rate": 8.251316516291586e-07, + "loss": 1.1035, + "step": 202 + }, + { + "epoch": 0.29831006612784716, + "grad_norm": 1.450031885451194, + "learning_rate": 8.233171172757539e-07, + "loss": 1.1351, + "step": 203 + }, + { + "epoch": 0.29977957384276266, + "grad_norm": 1.3679188570513114, + "learning_rate": 8.214952351314022e-07, + "loss": 1.105, + "step": 204 + }, + { + "epoch": 0.30124908155767816, + "grad_norm": 1.3734569453159238, + "learning_rate": 8.196660466006823e-07, + "loss": 1.1665, + "step": 205 + }, + { + "epoch": 0.3027185892725937, + "grad_norm": 1.4252740827310968, + "learning_rate": 8.178295932542205e-07, + "loss": 1.1422, + "step": 206 + }, + { + "epoch": 0.3041880969875092, + "grad_norm": 1.38166857353608, + "learning_rate": 8.159859168277444e-07, + "loss": 1.0333, + "step": 207 + }, + { + "epoch": 0.3056576047024247, + "grad_norm": 1.4722910903729798, + "learning_rate": 8.141350592211358e-07, + "loss": 1.134, + "step": 208 + }, + { + "epoch": 0.3071271124173402, + "grad_norm": 1.4074834904053308, + "learning_rate": 8.122770624974778e-07, + "loss": 1.0712, + "step": 209 + }, + { + "epoch": 0.3085966201322557, + "grad_norm": 1.4280613261841697, + "learning_rate": 8.10411968882099e-07, + "loss": 1.1277, + "step": 210 + }, + { + "epoch": 0.3100661278471712, + "grad_norm": 1.3743456352090058, + "learning_rate": 8.085398207616138e-07, + "loss": 1.2189, + "step": 211 + }, + { + "epoch": 0.3115356355620867, + "grad_norm": 1.4591835041502073, + "learning_rate": 8.06660660682959e-07, + "loss": 1.0779, + "step": 212 + }, + { + "epoch": 0.3130051432770022, + "grad_norm": 1.3896625615186315, + "learning_rate": 8.047745313524275e-07, + "loss": 1.118, + "step": 213 + }, + { + "epoch": 0.3144746509919177, + "grad_norm": 1.4095222295795578, + "learning_rate": 8.028814756346967e-07, + "loss": 1.1015, + "step": 214 + }, + { + "epoch": 0.31594415870683323, + "grad_norm": 1.3835385844364931, + "learning_rate": 8.009815365518554e-07, + "loss": 1.1964, + "step": 215 + }, + { + "epoch": 0.31741366642174873, + "grad_norm": 1.4442582574354577, + "learning_rate": 7.990747572824253e-07, + "loss": 1.0994, + "step": 216 + }, + { + "epoch": 0.3188831741366642, + "grad_norm": 1.540269470315885, + "learning_rate": 7.971611811603803e-07, + "loss": 1.1612, + "step": 217 + }, + { + "epoch": 0.3203526818515797, + "grad_norm": 1.5012905943987096, + "learning_rate": 7.952408516741607e-07, + "loss": 1.1212, + "step": 218 + }, + { + "epoch": 0.3218221895664952, + "grad_norm": 1.377730602489123, + "learning_rate": 7.933138124656864e-07, + "loss": 1.136, + "step": 219 + }, + { + "epoch": 0.3232916972814107, + "grad_norm": 1.4387714462293955, + "learning_rate": 7.913801073293638e-07, + "loss": 1.1662, + "step": 220 + }, + { + "epoch": 0.3247612049963262, + "grad_norm": 1.3416069488877052, + "learning_rate": 7.894397802110908e-07, + "loss": 1.1242, + "step": 221 + }, + { + "epoch": 0.3262307127112417, + "grad_norm": 1.3608424909383003, + "learning_rate": 7.87492875207259e-07, + "loss": 1.2513, + "step": 222 + }, + { + "epoch": 0.32770022042615726, + "grad_norm": 1.3266709078386894, + "learning_rate": 7.855394365637495e-07, + "loss": 1.1342, + "step": 223 + }, + { + "epoch": 0.32916972814107276, + "grad_norm": 1.3442846396883863, + "learning_rate": 7.835795086749299e-07, + "loss": 1.1801, + "step": 224 + }, + { + "epoch": 0.33063923585598826, + "grad_norm": 1.4096410255441036, + "learning_rate": 7.816131360826434e-07, + "loss": 1.1195, + "step": 225 + }, + { + "epoch": 0.33210874357090375, + "grad_norm": 1.3103219218263102, + "learning_rate": 7.796403634751973e-07, + "loss": 1.1207, + "step": 226 + }, + { + "epoch": 0.33357825128581925, + "grad_norm": 1.3677511713167119, + "learning_rate": 7.776612356863477e-07, + "loss": 1.0818, + "step": 227 + }, + { + "epoch": 0.33504775900073475, + "grad_norm": 1.4153862144691094, + "learning_rate": 7.756757976942798e-07, + "loss": 1.1155, + "step": 228 + }, + { + "epoch": 0.33651726671565024, + "grad_norm": 1.4438500129836211, + "learning_rate": 7.736840946205865e-07, + "loss": 1.193, + "step": 229 + }, + { + "epoch": 0.33798677443056574, + "grad_norm": 1.456870674708481, + "learning_rate": 7.716861717292424e-07, + "loss": 1.1199, + "step": 230 + }, + { + "epoch": 0.33945628214548124, + "grad_norm": 1.3923730055470556, + "learning_rate": 7.696820744255756e-07, + "loss": 1.2182, + "step": 231 + }, + { + "epoch": 0.3409257898603968, + "grad_norm": 1.370735300132156, + "learning_rate": 7.676718482552353e-07, + "loss": 1.134, + "step": 232 + }, + { + "epoch": 0.3423952975753123, + "grad_norm": 1.3480114555788523, + "learning_rate": 7.65655538903157e-07, + "loss": 1.0998, + "step": 233 + }, + { + "epoch": 0.3438648052902278, + "grad_norm": 1.4276815010976192, + "learning_rate": 7.636331921925241e-07, + "loss": 1.1696, + "step": 234 + }, + { + "epoch": 0.3453343130051433, + "grad_norm": 1.4170232593240883, + "learning_rate": 7.61604854083727e-07, + "loss": 1.116, + "step": 235 + }, + { + "epoch": 0.3468038207200588, + "grad_norm": 1.4134392902146415, + "learning_rate": 7.595705706733178e-07, + "loss": 1.1178, + "step": 236 + }, + { + "epoch": 0.34827332843497427, + "grad_norm": 1.3744159242762746, + "learning_rate": 7.575303881929632e-07, + "loss": 1.1824, + "step": 237 + }, + { + "epoch": 0.34974283614988977, + "grad_norm": 1.38324732435889, + "learning_rate": 7.55484353008394e-07, + "loss": 1.1422, + "step": 238 + }, + { + "epoch": 0.35121234386480527, + "grad_norm": 1.3881791765597504, + "learning_rate": 7.534325116183508e-07, + "loss": 1.1597, + "step": 239 + }, + { + "epoch": 0.3526818515797208, + "grad_norm": 1.396997073324397, + "learning_rate": 7.513749106535278e-07, + "loss": 1.1062, + "step": 240 + }, + { + "epoch": 0.3541513592946363, + "grad_norm": 1.443575506575812, + "learning_rate": 7.493115968755125e-07, + "loss": 1.1133, + "step": 241 + }, + { + "epoch": 0.3556208670095518, + "grad_norm": 1.4651722014658863, + "learning_rate": 7.472426171757238e-07, + "loss": 1.1609, + "step": 242 + }, + { + "epoch": 0.3570903747244673, + "grad_norm": 1.358202452421609, + "learning_rate": 7.451680185743454e-07, + "loss": 1.0965, + "step": 243 + }, + { + "epoch": 0.3585598824393828, + "grad_norm": 1.5175842365493015, + "learning_rate": 7.430878482192579e-07, + "loss": 1.0451, + "step": 244 + }, + { + "epoch": 0.3600293901542983, + "grad_norm": 1.4934774283507497, + "learning_rate": 7.41002153384967e-07, + "loss": 1.168, + "step": 245 + }, + { + "epoch": 0.3614988978692138, + "grad_norm": 1.4081881789418684, + "learning_rate": 7.389109814715292e-07, + "loss": 1.1344, + "step": 246 + }, + { + "epoch": 0.3629684055841293, + "grad_norm": 1.4765256904942174, + "learning_rate": 7.368143800034745e-07, + "loss": 1.1749, + "step": 247 + }, + { + "epoch": 0.36443791329904485, + "grad_norm": 1.3509670984266027, + "learning_rate": 7.347123966287265e-07, + "loss": 1.1334, + "step": 248 + }, + { + "epoch": 0.36590742101396034, + "grad_norm": 1.53782552923439, + "learning_rate": 7.326050791175196e-07, + "loss": 1.2062, + "step": 249 + }, + { + "epoch": 0.36737692872887584, + "grad_norm": 1.3428470092287972, + "learning_rate": 7.304924753613127e-07, + "loss": 1.0949, + "step": 250 + }, + { + "epoch": 0.36737692872887584, + "eval_ical_mcts_chains_sft_val_MORECHAINS_loss": 2.756098508834839, + "eval_ical_mcts_chains_sft_val_MORECHAINS_runtime": 5.9542, + "eval_ical_mcts_chains_sft_val_MORECHAINS_samples_per_second": 15.619, + "eval_ical_mcts_chains_sft_val_MORECHAINS_steps_per_second": 2.015, + "step": 250 + }, + { + "epoch": 0.36884643644379134, + "grad_norm": 1.3548965420628396, + "learning_rate": 7.283746333717014e-07, + "loss": 1.1102, + "step": 251 + }, + { + "epoch": 0.37031594415870683, + "grad_norm": 1.3760628358821343, + "learning_rate": 7.262516012793276e-07, + "loss": 1.0957, + "step": 252 + }, + { + "epoch": 0.37178545187362233, + "grad_norm": 1.4363319701743325, + "learning_rate": 7.241234273327838e-07, + "loss": 1.0782, + "step": 253 + }, + { + "epoch": 0.3732549595885378, + "grad_norm": 1.3292955709874987, + "learning_rate": 7.219901598975185e-07, + "loss": 1.0904, + "step": 254 + }, + { + "epoch": 0.3747244673034533, + "grad_norm": 1.3562568095097107, + "learning_rate": 7.198518474547354e-07, + "loss": 1.0553, + "step": 255 + }, + { + "epoch": 0.3761939750183688, + "grad_norm": 1.3922411026579398, + "learning_rate": 7.17708538600293e-07, + "loss": 1.1265, + "step": 256 + }, + { + "epoch": 0.3776634827332844, + "grad_norm": 1.3917980021891811, + "learning_rate": 7.155602820435992e-07, + "loss": 1.153, + "step": 257 + }, + { + "epoch": 0.37913299044819987, + "grad_norm": 1.4243974023254953, + "learning_rate": 7.134071266065051e-07, + "loss": 1.1354, + "step": 258 + }, + { + "epoch": 0.38060249816311537, + "grad_norm": 1.4529979227761458, + "learning_rate": 7.112491212221946e-07, + "loss": 1.0746, + "step": 259 + }, + { + "epoch": 0.38207200587803086, + "grad_norm": 1.381064754681566, + "learning_rate": 7.09086314934073e-07, + "loss": 1.0771, + "step": 260 + }, + { + "epoch": 0.38354151359294636, + "grad_norm": 1.3494944459929241, + "learning_rate": 7.069187568946524e-07, + "loss": 1.1317, + "step": 261 + }, + { + "epoch": 0.38501102130786186, + "grad_norm": 1.367365128216842, + "learning_rate": 7.047464963644342e-07, + "loss": 1.0841, + "step": 262 + }, + { + "epoch": 0.38648052902277735, + "grad_norm": 1.4111229982907987, + "learning_rate": 7.025695827107901e-07, + "loss": 1.1456, + "step": 263 + }, + { + "epoch": 0.38795003673769285, + "grad_norm": 1.4469253951516285, + "learning_rate": 7.003880654068395e-07, + "loss": 1.1497, + "step": 264 + }, + { + "epoch": 0.3894195444526084, + "grad_norm": 1.438128214166176, + "learning_rate": 6.98201994030326e-07, + "loss": 1.1566, + "step": 265 + }, + { + "epoch": 0.3908890521675239, + "grad_norm": 1.392547852669767, + "learning_rate": 6.960114182624902e-07, + "loss": 1.1729, + "step": 266 + }, + { + "epoch": 0.3923585598824394, + "grad_norm": 1.401971190522276, + "learning_rate": 6.938163878869405e-07, + "loss": 1.0718, + "step": 267 + }, + { + "epoch": 0.3938280675973549, + "grad_norm": 1.396822074711564, + "learning_rate": 6.916169527885221e-07, + "loss": 1.1141, + "step": 268 + }, + { + "epoch": 0.3952975753122704, + "grad_norm": 1.3883010823518094, + "learning_rate": 6.894131629521829e-07, + "loss": 1.2072, + "step": 269 + }, + { + "epoch": 0.3967670830271859, + "grad_norm": 1.400367946344679, + "learning_rate": 6.872050684618381e-07, + "loss": 1.1512, + "step": 270 + }, + { + "epoch": 0.3982365907421014, + "grad_norm": 1.386986813686829, + "learning_rate": 6.849927194992312e-07, + "loss": 1.0693, + "step": 271 + }, + { + "epoch": 0.3997060984570169, + "grad_norm": 1.4393959811216241, + "learning_rate": 6.827761663427943e-07, + "loss": 1.1799, + "step": 272 + }, + { + "epoch": 0.4011756061719324, + "grad_norm": 1.3989135808754347, + "learning_rate": 6.805554593665049e-07, + "loss": 1.1858, + "step": 273 + }, + { + "epoch": 0.4026451138868479, + "grad_norm": 1.3294118191399562, + "learning_rate": 6.783306490387414e-07, + "loss": 1.051, + "step": 274 + }, + { + "epoch": 0.4041146216017634, + "grad_norm": 1.3425765430711993, + "learning_rate": 6.761017859211359e-07, + "loss": 1.0931, + "step": 275 + }, + { + "epoch": 0.4055841293166789, + "grad_norm": 1.3979046769171386, + "learning_rate": 6.738689206674257e-07, + "loss": 1.0426, + "step": 276 + }, + { + "epoch": 0.4070536370315944, + "grad_norm": 1.3494939507393635, + "learning_rate": 6.716321040223014e-07, + "loss": 1.0846, + "step": 277 + }, + { + "epoch": 0.4085231447465099, + "grad_norm": 1.38158252960459, + "learning_rate": 6.693913868202539e-07, + "loss": 1.0751, + "step": 278 + }, + { + "epoch": 0.4099926524614254, + "grad_norm": 1.3216203318442303, + "learning_rate": 6.671468199844192e-07, + "loss": 1.074, + "step": 279 + }, + { + "epoch": 0.4114621601763409, + "grad_norm": 1.3720532658754991, + "learning_rate": 6.648984545254216e-07, + "loss": 1.1034, + "step": 280 + }, + { + "epoch": 0.4129316678912564, + "grad_norm": 1.3638059704989733, + "learning_rate": 6.626463415402131e-07, + "loss": 1.1033, + "step": 281 + }, + { + "epoch": 0.41440117560617196, + "grad_norm": 1.3293227220141846, + "learning_rate": 6.603905322109138e-07, + "loss": 1.1054, + "step": 282 + }, + { + "epoch": 0.41587068332108745, + "grad_norm": 1.4610059929514447, + "learning_rate": 6.581310778036474e-07, + "loss": 0.9441, + "step": 283 + }, + { + "epoch": 0.41734019103600295, + "grad_norm": 1.412604693032899, + "learning_rate": 6.558680296673766e-07, + "loss": 1.1464, + "step": 284 + }, + { + "epoch": 0.41880969875091845, + "grad_norm": 1.3384277299187781, + "learning_rate": 6.536014392327365e-07, + "loss": 1.1396, + "step": 285 + }, + { + "epoch": 0.42027920646583394, + "grad_norm": 1.3987728346478638, + "learning_rate": 6.51331358010865e-07, + "loss": 1.0833, + "step": 286 + }, + { + "epoch": 0.42174871418074944, + "grad_norm": 1.577477591087323, + "learning_rate": 6.490578375922328e-07, + "loss": 1.1428, + "step": 287 + }, + { + "epoch": 0.42321822189566494, + "grad_norm": 1.5588639617673477, + "learning_rate": 6.467809296454708e-07, + "loss": 1.0857, + "step": 288 + }, + { + "epoch": 0.42468772961058043, + "grad_norm": 1.4118713026809537, + "learning_rate": 6.445006859161956e-07, + "loss": 1.1891, + "step": 289 + }, + { + "epoch": 0.426157237325496, + "grad_norm": 1.414060079207387, + "learning_rate": 6.422171582258334e-07, + "loss": 1.0867, + "step": 290 + }, + { + "epoch": 0.4276267450404115, + "grad_norm": 1.4123941837634646, + "learning_rate": 6.399303984704432e-07, + "loss": 1.1262, + "step": 291 + }, + { + "epoch": 0.429096252755327, + "grad_norm": 1.445359468963558, + "learning_rate": 6.376404586195364e-07, + "loss": 1.0714, + "step": 292 + }, + { + "epoch": 0.4305657604702425, + "grad_norm": 1.4230042553733626, + "learning_rate": 6.353473907148961e-07, + "loss": 1.1363, + "step": 293 + }, + { + "epoch": 0.432035268185158, + "grad_norm": 1.325942136523235, + "learning_rate": 6.330512468693944e-07, + "loss": 1.062, + "step": 294 + }, + { + "epoch": 0.43350477590007347, + "grad_norm": 1.4212600729175167, + "learning_rate": 6.307520792658081e-07, + "loss": 1.1282, + "step": 295 + }, + { + "epoch": 0.43497428361498897, + "grad_norm": 1.4254820055813413, + "learning_rate": 6.284499401556328e-07, + "loss": 1.1743, + "step": 296 + }, + { + "epoch": 0.43644379132990446, + "grad_norm": 1.4651887201716516, + "learning_rate": 6.261448818578952e-07, + "loss": 1.0821, + "step": 297 + }, + { + "epoch": 0.43791329904481996, + "grad_norm": 1.360035413094489, + "learning_rate": 6.238369567579642e-07, + "loss": 1.1277, + "step": 298 + }, + { + "epoch": 0.4393828067597355, + "grad_norm": 1.3842334621043852, + "learning_rate": 6.215262173063607e-07, + "loss": 1.134, + "step": 299 + }, + { + "epoch": 0.440852314474651, + "grad_norm": 1.5072722906394864, + "learning_rate": 6.192127160175649e-07, + "loss": 1.1385, + "step": 300 + }, + { + "epoch": 0.4423218221895665, + "grad_norm": 1.3613054784017538, + "learning_rate": 6.168965054688238e-07, + "loss": 1.1117, + "step": 301 + }, + { + "epoch": 0.443791329904482, + "grad_norm": 1.4231270036725268, + "learning_rate": 6.145776382989552e-07, + "loss": 1.0889, + "step": 302 + }, + { + "epoch": 0.4452608376193975, + "grad_norm": 1.4588262408030779, + "learning_rate": 6.122561672071521e-07, + "loss": 1.0359, + "step": 303 + }, + { + "epoch": 0.446730345334313, + "grad_norm": 1.411260640294453, + "learning_rate": 6.099321449517851e-07, + "loss": 1.1288, + "step": 304 + }, + { + "epoch": 0.4481998530492285, + "grad_norm": 1.3956027122525714, + "learning_rate": 6.076056243492035e-07, + "loss": 1.1173, + "step": 305 + }, + { + "epoch": 0.449669360764144, + "grad_norm": 1.4227235203095976, + "learning_rate": 6.052766582725339e-07, + "loss": 1.0394, + "step": 306 + }, + { + "epoch": 0.45113886847905954, + "grad_norm": 1.3911528073579336, + "learning_rate": 6.029452996504801e-07, + "loss": 1.1346, + "step": 307 + }, + { + "epoch": 0.45260837619397504, + "grad_norm": 1.3598137504924201, + "learning_rate": 6.006116014661191e-07, + "loss": 1.1715, + "step": 308 + }, + { + "epoch": 0.45407788390889053, + "grad_norm": 1.3341402291681796, + "learning_rate": 5.982756167556978e-07, + "loss": 1.0628, + "step": 309 + }, + { + "epoch": 0.45554739162380603, + "grad_norm": 1.4336307282930423, + "learning_rate": 5.959373986074269e-07, + "loss": 1.1031, + "step": 310 + }, + { + "epoch": 0.4570168993387215, + "grad_norm": 1.4435742655464976, + "learning_rate": 5.935970001602751e-07, + "loss": 1.0705, + "step": 311 + }, + { + "epoch": 0.458486407053637, + "grad_norm": 1.295586975475687, + "learning_rate": 5.912544746027612e-07, + "loss": 1.082, + "step": 312 + }, + { + "epoch": 0.4599559147685525, + "grad_norm": 1.419413405526378, + "learning_rate": 5.88909875171745e-07, + "loss": 1.1675, + "step": 313 + }, + { + "epoch": 0.461425422483468, + "grad_norm": 1.437607892446711, + "learning_rate": 5.865632551512175e-07, + "loss": 1.0499, + "step": 314 + }, + { + "epoch": 0.4628949301983835, + "grad_norm": 1.308082043000319, + "learning_rate": 5.842146678710911e-07, + "loss": 1.1021, + "step": 315 + }, + { + "epoch": 0.46436443791329907, + "grad_norm": 1.4411999480884474, + "learning_rate": 5.818641667059856e-07, + "loss": 1.115, + "step": 316 + }, + { + "epoch": 0.46583394562821456, + "grad_norm": 1.4248859054466834, + "learning_rate": 5.795118050740169e-07, + "loss": 1.0735, + "step": 317 + }, + { + "epoch": 0.46730345334313006, + "grad_norm": 1.3950677919239058, + "learning_rate": 5.771576364355819e-07, + "loss": 1.1289, + "step": 318 + }, + { + "epoch": 0.46877296105804556, + "grad_norm": 1.4356515357949984, + "learning_rate": 5.748017142921448e-07, + "loss": 1.1771, + "step": 319 + }, + { + "epoch": 0.47024246877296105, + "grad_norm": 1.3803272984145392, + "learning_rate": 5.724440921850195e-07, + "loss": 1.2042, + "step": 320 + }, + { + "epoch": 0.47171197648787655, + "grad_norm": 1.3828275996220702, + "learning_rate": 5.700848236941543e-07, + "loss": 1.1014, + "step": 321 + }, + { + "epoch": 0.47318148420279205, + "grad_norm": 1.4372467929841282, + "learning_rate": 5.677239624369134e-07, + "loss": 1.0635, + "step": 322 + }, + { + "epoch": 0.47465099191770754, + "grad_norm": 1.4412482068570687, + "learning_rate": 5.653615620668589e-07, + "loss": 1.0709, + "step": 323 + }, + { + "epoch": 0.4761204996326231, + "grad_norm": 1.5223869305783915, + "learning_rate": 5.629976762725307e-07, + "loss": 1.0613, + "step": 324 + }, + { + "epoch": 0.4775900073475386, + "grad_norm": 1.4106384412205724, + "learning_rate": 5.606323587762275e-07, + "loss": 1.3318, + "step": 325 + }, + { + "epoch": 0.4790595150624541, + "grad_norm": 1.4808028294260986, + "learning_rate": 5.582656633327848e-07, + "loss": 1.1348, + "step": 326 + }, + { + "epoch": 0.4805290227773696, + "grad_norm": 1.411232450754018, + "learning_rate": 5.558976437283535e-07, + "loss": 1.1455, + "step": 327 + }, + { + "epoch": 0.4819985304922851, + "grad_norm": 1.3874875642549753, + "learning_rate": 5.535283537791785e-07, + "loss": 1.1502, + "step": 328 + }, + { + "epoch": 0.4834680382072006, + "grad_norm": 1.529379416962907, + "learning_rate": 5.511578473303742e-07, + "loss": 1.1306, + "step": 329 + }, + { + "epoch": 0.4849375459221161, + "grad_norm": 1.4353633142872315, + "learning_rate": 5.487861782547017e-07, + "loss": 1.1453, + "step": 330 + }, + { + "epoch": 0.4864070536370316, + "grad_norm": 1.3293126841762646, + "learning_rate": 5.464134004513442e-07, + "loss": 1.1422, + "step": 331 + }, + { + "epoch": 0.4878765613519471, + "grad_norm": 1.3853962798430477, + "learning_rate": 5.440395678446825e-07, + "loss": 1.1431, + "step": 332 + }, + { + "epoch": 0.4893460690668626, + "grad_norm": 1.394633338720216, + "learning_rate": 5.416647343830687e-07, + "loss": 1.0997, + "step": 333 + }, + { + "epoch": 0.4908155767817781, + "grad_norm": 1.3557987209086912, + "learning_rate": 5.392889540376006e-07, + "loss": 1.0817, + "step": 334 + }, + { + "epoch": 0.4922850844966936, + "grad_norm": 2.8096656165818645, + "learning_rate": 5.369122808008955e-07, + "loss": 1.101, + "step": 335 + }, + { + "epoch": 0.4937545922116091, + "grad_norm": 1.463042361823667, + "learning_rate": 5.345347686858626e-07, + "loss": 1.1002, + "step": 336 + }, + { + "epoch": 0.4952240999265246, + "grad_norm": 1.513004558088391, + "learning_rate": 5.321564717244757e-07, + "loss": 1.0977, + "step": 337 + }, + { + "epoch": 0.4966936076414401, + "grad_norm": 1.513966316502475, + "learning_rate": 5.297774439665449e-07, + "loss": 1.1179, + "step": 338 + }, + { + "epoch": 0.4981631153563556, + "grad_norm": 1.3659383294811929, + "learning_rate": 5.273977394784892e-07, + "loss": 1.1009, + "step": 339 + }, + { + "epoch": 0.4996326230712711, + "grad_norm": 1.4406236511077748, + "learning_rate": 5.250174123421068e-07, + "loss": 1.2597, + "step": 340 + }, + { + "epoch": 0.5011021307861866, + "grad_norm": 1.4120228303295752, + "learning_rate": 5.226365166533458e-07, + "loss": 1.1148, + "step": 341 + }, + { + "epoch": 0.5025716385011021, + "grad_norm": 1.3150193319220376, + "learning_rate": 5.202551065210768e-07, + "loss": 1.0589, + "step": 342 + }, + { + "epoch": 0.5040411462160176, + "grad_norm": 1.3552885803220251, + "learning_rate": 5.178732360658605e-07, + "loss": 1.1281, + "step": 343 + }, + { + "epoch": 0.5055106539309331, + "grad_norm": 1.3782165714758237, + "learning_rate": 5.154909594187192e-07, + "loss": 1.115, + "step": 344 + }, + { + "epoch": 0.5069801616458487, + "grad_norm": 1.3839651456991835, + "learning_rate": 5.131083307199071e-07, + "loss": 1.1355, + "step": 345 + }, + { + "epoch": 0.5084496693607642, + "grad_norm": 1.3239848936899146, + "learning_rate": 5.107254041176788e-07, + "loss": 1.144, + "step": 346 + }, + { + "epoch": 0.5099191770756797, + "grad_norm": 1.4174744669080177, + "learning_rate": 5.08342233767059e-07, + "loss": 1.0891, + "step": 347 + }, + { + "epoch": 0.5113886847905952, + "grad_norm": 1.4197584070932192, + "learning_rate": 5.059588738286118e-07, + "loss": 1.1711, + "step": 348 + }, + { + "epoch": 0.5128581925055107, + "grad_norm": 1.460332635848889, + "learning_rate": 5.035753784672105e-07, + "loss": 1.1173, + "step": 349 + }, + { + "epoch": 0.5143277002204262, + "grad_norm": 1.4181458807955096, + "learning_rate": 5.011918018508057e-07, + "loss": 1.1409, + "step": 350 + }, + { + "epoch": 0.5157972079353417, + "grad_norm": 1.401868758466356, + "learning_rate": 4.988081981491944e-07, + "loss": 1.08, + "step": 351 + }, + { + "epoch": 0.5172667156502572, + "grad_norm": 1.4708608243218393, + "learning_rate": 4.964246215327894e-07, + "loss": 1.2088, + "step": 352 + }, + { + "epoch": 0.5187362233651727, + "grad_norm": 1.3087914829912093, + "learning_rate": 4.940411261713882e-07, + "loss": 1.088, + "step": 353 + }, + { + "epoch": 0.5202057310800882, + "grad_norm": 1.3639757292403556, + "learning_rate": 4.91657766232941e-07, + "loss": 1.1503, + "step": 354 + }, + { + "epoch": 0.5216752387950037, + "grad_norm": 1.3230263031119276, + "learning_rate": 4.892745958823213e-07, + "loss": 1.096, + "step": 355 + }, + { + "epoch": 0.5231447465099192, + "grad_norm": 1.3801068609972764, + "learning_rate": 4.868916692800928e-07, + "loss": 1.1064, + "step": 356 + }, + { + "epoch": 0.5246142542248347, + "grad_norm": 1.4818081892670187, + "learning_rate": 4.845090405812809e-07, + "loss": 1.0788, + "step": 357 + }, + { + "epoch": 0.5260837619397501, + "grad_norm": 1.4257930922965831, + "learning_rate": 4.821267639341397e-07, + "loss": 1.0475, + "step": 358 + }, + { + "epoch": 0.5275532696546656, + "grad_norm": 1.3664827162861548, + "learning_rate": 4.797448934789232e-07, + "loss": 1.0764, + "step": 359 + }, + { + "epoch": 0.5290227773695811, + "grad_norm": 1.384036889671994, + "learning_rate": 4.773634833466541e-07, + "loss": 1.0491, + "step": 360 + }, + { + "epoch": 0.5304922850844966, + "grad_norm": 1.3641782448500568, + "learning_rate": 4.7498258765789335e-07, + "loss": 1.0597, + "step": 361 + }, + { + "epoch": 0.5319617927994122, + "grad_norm": 1.423492847149906, + "learning_rate": 4.726022605215108e-07, + "loss": 1.2022, + "step": 362 + }, + { + "epoch": 0.5334313005143277, + "grad_norm": 1.4584247852241217, + "learning_rate": 4.7022255603345504e-07, + "loss": 1.0439, + "step": 363 + }, + { + "epoch": 0.5349008082292432, + "grad_norm": 1.4065237109476447, + "learning_rate": 4.6784352827552433e-07, + "loss": 1.0945, + "step": 364 + }, + { + "epoch": 0.5363703159441587, + "grad_norm": 1.359008408282674, + "learning_rate": 4.6546523131413737e-07, + "loss": 1.0858, + "step": 365 + }, + { + "epoch": 0.5378398236590742, + "grad_norm": 1.395920829743356, + "learning_rate": 4.6308771919910455e-07, + "loss": 1.0821, + "step": 366 + }, + { + "epoch": 0.5393093313739897, + "grad_norm": 1.4101595258715152, + "learning_rate": 4.607110459623994e-07, + "loss": 1.0566, + "step": 367 + }, + { + "epoch": 0.5407788390889052, + "grad_norm": 1.4400551979551548, + "learning_rate": 4.5833526561693146e-07, + "loss": 1.0875, + "step": 368 + }, + { + "epoch": 0.5422483468038207, + "grad_norm": 1.4569650644657774, + "learning_rate": 4.559604321553176e-07, + "loss": 1.0945, + "step": 369 + }, + { + "epoch": 0.5437178545187362, + "grad_norm": 1.4073481731963642, + "learning_rate": 4.535865995486559e-07, + "loss": 1.1326, + "step": 370 + }, + { + "epoch": 0.5451873622336517, + "grad_norm": 1.3843052803402691, + "learning_rate": 4.512138217452984e-07, + "loss": 1.0445, + "step": 371 + }, + { + "epoch": 0.5466568699485672, + "grad_norm": 1.482116494998853, + "learning_rate": 4.488421526696259e-07, + "loss": 1.0871, + "step": 372 + }, + { + "epoch": 0.5481263776634827, + "grad_norm": 1.3248580273316146, + "learning_rate": 4.464716462208216e-07, + "loss": 1.0515, + "step": 373 + }, + { + "epoch": 0.5495958853783982, + "grad_norm": 1.412909824990006, + "learning_rate": 4.441023562716464e-07, + "loss": 1.1317, + "step": 374 + }, + { + "epoch": 0.5510653930933137, + "grad_norm": 1.3923300607754752, + "learning_rate": 4.417343366672154e-07, + "loss": 1.0727, + "step": 375 + }, + { + "epoch": 0.5525349008082292, + "grad_norm": 1.3989792278937028, + "learning_rate": 4.393676412237726e-07, + "loss": 1.1355, + "step": 376 + }, + { + "epoch": 0.5540044085231447, + "grad_norm": 1.3946392251166513, + "learning_rate": 4.370023237274693e-07, + "loss": 1.0268, + "step": 377 + }, + { + "epoch": 0.5554739162380602, + "grad_norm": 1.41416247304468, + "learning_rate": 4.3463843793314123e-07, + "loss": 1.0584, + "step": 378 + }, + { + "epoch": 0.5569434239529758, + "grad_norm": 1.413788607440302, + "learning_rate": 4.322760375630867e-07, + "loss": 1.1846, + "step": 379 + }, + { + "epoch": 0.5584129316678913, + "grad_norm": 1.3360695239659472, + "learning_rate": 4.299151763058457e-07, + "loss": 1.1525, + "step": 380 + }, + { + "epoch": 0.5598824393828068, + "grad_norm": 1.3928904548700607, + "learning_rate": 4.2755590781498056e-07, + "loss": 1.0295, + "step": 381 + }, + { + "epoch": 0.5613519470977223, + "grad_norm": 1.4059368688788658, + "learning_rate": 4.251982857078553e-07, + "loss": 1.0474, + "step": 382 + }, + { + "epoch": 0.5628214548126378, + "grad_norm": 1.5189950174163334, + "learning_rate": 4.2284236356441817e-07, + "loss": 1.1166, + "step": 383 + }, + { + "epoch": 0.5642909625275533, + "grad_norm": 1.5773519459794447, + "learning_rate": 4.204881949259832e-07, + "loss": 1.216, + "step": 384 + }, + { + "epoch": 0.5657604702424688, + "grad_norm": 1.36093051783186, + "learning_rate": 4.181358332940144e-07, + "loss": 1.1028, + "step": 385 + }, + { + "epoch": 0.5672299779573843, + "grad_norm": 1.371801222763194, + "learning_rate": 4.157853321289089e-07, + "loss": 1.1299, + "step": 386 + }, + { + "epoch": 0.5686994856722998, + "grad_norm": 1.4309586056121164, + "learning_rate": 4.1343674484878236e-07, + "loss": 1.0961, + "step": 387 + }, + { + "epoch": 0.5701689933872153, + "grad_norm": 1.3916946581373462, + "learning_rate": 4.11090124828255e-07, + "loss": 1.084, + "step": 388 + }, + { + "epoch": 0.5716385011021308, + "grad_norm": 1.3379667194162195, + "learning_rate": 4.0874552539723873e-07, + "loss": 1.0954, + "step": 389 + }, + { + "epoch": 0.5731080088170463, + "grad_norm": 1.4682746859969864, + "learning_rate": 4.064029998397247e-07, + "loss": 1.123, + "step": 390 + }, + { + "epoch": 0.5745775165319618, + "grad_norm": 1.3797386330911343, + "learning_rate": 4.04062601392573e-07, + "loss": 1.1146, + "step": 391 + }, + { + "epoch": 0.5760470242468773, + "grad_norm": 1.4080533490533538, + "learning_rate": 4.017243832443021e-07, + "loss": 1.1117, + "step": 392 + }, + { + "epoch": 0.5775165319617928, + "grad_norm": 1.4279995751396037, + "learning_rate": 3.993883985338808e-07, + "loss": 1.0809, + "step": 393 + }, + { + "epoch": 0.5789860396767083, + "grad_norm": 1.3175097784254468, + "learning_rate": 3.9705470034951986e-07, + "loss": 1.0567, + "step": 394 + }, + { + "epoch": 0.5804555473916239, + "grad_norm": 1.3671088379708558, + "learning_rate": 3.9472334172746596e-07, + "loss": 1.1201, + "step": 395 + }, + { + "epoch": 0.5819250551065394, + "grad_norm": 1.4728248748732817, + "learning_rate": 3.9239437565079645e-07, + "loss": 1.1274, + "step": 396 + }, + { + "epoch": 0.5833945628214549, + "grad_norm": 1.4007045355088419, + "learning_rate": 3.900678550482147e-07, + "loss": 1.1204, + "step": 397 + }, + { + "epoch": 0.5848640705363704, + "grad_norm": 1.3734381784823617, + "learning_rate": 3.877438327928478e-07, + "loss": 1.1203, + "step": 398 + }, + { + "epoch": 0.5863335782512858, + "grad_norm": 1.4011368340836778, + "learning_rate": 3.854223617010448e-07, + "loss": 1.0533, + "step": 399 + }, + { + "epoch": 0.5878030859662013, + "grad_norm": 1.419012334062334, + "learning_rate": 3.8310349453117617e-07, + "loss": 1.0898, + "step": 400 + }, + { + "epoch": 0.5892725936811168, + "grad_norm": 1.3485874322249378, + "learning_rate": 3.8078728398243503e-07, + "loss": 1.1228, + "step": 401 + }, + { + "epoch": 0.5907421013960323, + "grad_norm": 1.4516071343212644, + "learning_rate": 3.784737826936393e-07, + "loss": 1.0773, + "step": 402 + }, + { + "epoch": 0.5922116091109478, + "grad_norm": 1.2870748285607192, + "learning_rate": 3.761630432420358e-07, + "loss": 1.0489, + "step": 403 + }, + { + "epoch": 0.5936811168258633, + "grad_norm": 1.4253057600978802, + "learning_rate": 3.7385511814210493e-07, + "loss": 1.1059, + "step": 404 + }, + { + "epoch": 0.5951506245407788, + "grad_norm": 1.4362421490175297, + "learning_rate": 3.715500598443672e-07, + "loss": 1.0798, + "step": 405 + }, + { + "epoch": 0.5966201322556943, + "grad_norm": 1.3142558631846668, + "learning_rate": 3.6924792073419193e-07, + "loss": 1.095, + "step": 406 + }, + { + "epoch": 0.5980896399706098, + "grad_norm": 1.4180620072245396, + "learning_rate": 3.6694875313060567e-07, + "loss": 1.1618, + "step": 407 + }, + { + "epoch": 0.5995591476855253, + "grad_norm": 1.4440333294474847, + "learning_rate": 3.646526092851039e-07, + "loss": 1.1742, + "step": 408 + }, + { + "epoch": 0.6010286554004408, + "grad_norm": 1.365950277580121, + "learning_rate": 3.623595413804636e-07, + "loss": 1.1049, + "step": 409 + }, + { + "epoch": 0.6024981631153563, + "grad_norm": 1.3800394626203474, + "learning_rate": 3.600696015295568e-07, + "loss": 1.0812, + "step": 410 + }, + { + "epoch": 0.6039676708302718, + "grad_norm": 1.4309055297758246, + "learning_rate": 3.577828417741665e-07, + "loss": 1.1591, + "step": 411 + }, + { + "epoch": 0.6054371785451874, + "grad_norm": 1.4376053158203683, + "learning_rate": 3.5549931408380446e-07, + "loss": 1.1083, + "step": 412 + }, + { + "epoch": 0.6069066862601029, + "grad_norm": 1.378409944475253, + "learning_rate": 3.5321907035452913e-07, + "loss": 1.0625, + "step": 413 + }, + { + "epoch": 0.6083761939750184, + "grad_norm": 1.4032013297402026, + "learning_rate": 3.509421624077672e-07, + "loss": 1.099, + "step": 414 + }, + { + "epoch": 0.6098457016899339, + "grad_norm": 1.403659794537397, + "learning_rate": 3.486686419891349e-07, + "loss": 1.0488, + "step": 415 + }, + { + "epoch": 0.6113152094048494, + "grad_norm": 1.3788151443608003, + "learning_rate": 3.4639856076726346e-07, + "loss": 1.113, + "step": 416 + }, + { + "epoch": 0.6127847171197649, + "grad_norm": 1.4287954215942635, + "learning_rate": 3.4413197033262343e-07, + "loss": 1.0992, + "step": 417 + }, + { + "epoch": 0.6142542248346804, + "grad_norm": 1.439755106322846, + "learning_rate": 3.4186892219635254e-07, + "loss": 1.0662, + "step": 418 + }, + { + "epoch": 0.6157237325495959, + "grad_norm": 1.4201132330693378, + "learning_rate": 3.396094677890862e-07, + "loss": 1.1108, + "step": 419 + }, + { + "epoch": 0.6171932402645114, + "grad_norm": 1.4151859235424602, + "learning_rate": 3.373536584597869e-07, + "loss": 1.043, + "step": 420 + }, + { + "epoch": 0.6186627479794269, + "grad_norm": 1.3436922877791724, + "learning_rate": 3.3510154547457845e-07, + "loss": 1.1002, + "step": 421 + }, + { + "epoch": 0.6201322556943424, + "grad_norm": 1.3768814743005016, + "learning_rate": 3.3285318001558076e-07, + "loss": 1.099, + "step": 422 + }, + { + "epoch": 0.6216017634092579, + "grad_norm": 1.9818793407102133, + "learning_rate": 3.306086131797462e-07, + "loss": 1.101, + "step": 423 + }, + { + "epoch": 0.6230712711241734, + "grad_norm": 1.5501432398127912, + "learning_rate": 3.283678959776986e-07, + "loss": 1.1401, + "step": 424 + }, + { + "epoch": 0.6245407788390889, + "grad_norm": 1.3451070295410414, + "learning_rate": 3.261310793325742e-07, + "loss": 1.0917, + "step": 425 + }, + { + "epoch": 0.6260102865540044, + "grad_norm": 1.3998218088329646, + "learning_rate": 3.23898214078864e-07, + "loss": 1.1472, + "step": 426 + }, + { + "epoch": 0.6274797942689199, + "grad_norm": 1.4699494469190635, + "learning_rate": 3.216693509612587e-07, + "loss": 1.0937, + "step": 427 + }, + { + "epoch": 0.6289493019838354, + "grad_norm": 1.463414683070946, + "learning_rate": 3.19444540633495e-07, + "loss": 1.1389, + "step": 428 + }, + { + "epoch": 0.630418809698751, + "grad_norm": 1.4765841861514846, + "learning_rate": 3.172238336572056e-07, + "loss": 1.2141, + "step": 429 + }, + { + "epoch": 0.6318883174136665, + "grad_norm": 1.4207175135849464, + "learning_rate": 3.1500728050076873e-07, + "loss": 1.0713, + "step": 430 + }, + { + "epoch": 0.633357825128582, + "grad_norm": 1.3488069296637555, + "learning_rate": 3.1279493153816183e-07, + "loss": 1.0483, + "step": 431 + }, + { + "epoch": 0.6348273328434975, + "grad_norm": 1.4026690915785558, + "learning_rate": 3.1058683704781707e-07, + "loss": 1.0378, + "step": 432 + }, + { + "epoch": 0.636296840558413, + "grad_norm": 1.368530405030037, + "learning_rate": 3.0838304721147803e-07, + "loss": 1.0595, + "step": 433 + }, + { + "epoch": 0.6377663482733285, + "grad_norm": 1.3798655085044127, + "learning_rate": 3.0618361211305956e-07, + "loss": 1.1855, + "step": 434 + }, + { + "epoch": 0.639235855988244, + "grad_norm": 1.4358356459308954, + "learning_rate": 3.0398858173750994e-07, + "loss": 1.1314, + "step": 435 + }, + { + "epoch": 0.6407053637031594, + "grad_norm": 1.3911243386477645, + "learning_rate": 3.0179800596967414e-07, + "loss": 1.1479, + "step": 436 + }, + { + "epoch": 0.6421748714180749, + "grad_norm": 1.3787211953067466, + "learning_rate": 2.996119345931607e-07, + "loss": 1.0852, + "step": 437 + }, + { + "epoch": 0.6436443791329904, + "grad_norm": 1.3784361879287548, + "learning_rate": 2.9743041728921004e-07, + "loss": 1.0617, + "step": 438 + }, + { + "epoch": 0.6451138868479059, + "grad_norm": 1.4563498056244404, + "learning_rate": 2.952535036355659e-07, + "loss": 1.0172, + "step": 439 + }, + { + "epoch": 0.6465833945628214, + "grad_norm": 1.3911833970178598, + "learning_rate": 2.930812431053477e-07, + "loss": 1.1772, + "step": 440 + }, + { + "epoch": 0.6480529022777369, + "grad_norm": 1.456141852044334, + "learning_rate": 2.9091368506592704e-07, + "loss": 1.1163, + "step": 441 + }, + { + "epoch": 0.6495224099926524, + "grad_norm": 1.4409576849555352, + "learning_rate": 2.8875087877780547e-07, + "loss": 1.0099, + "step": 442 + }, + { + "epoch": 0.6509919177075679, + "grad_norm": 1.3739666211555663, + "learning_rate": 2.865928733934951e-07, + "loss": 1.074, + "step": 443 + }, + { + "epoch": 0.6524614254224834, + "grad_norm": 1.4333872958890426, + "learning_rate": 2.844397179564009e-07, + "loss": 1.119, + "step": 444 + }, + { + "epoch": 0.6539309331373989, + "grad_norm": 1.307442428138931, + "learning_rate": 2.8229146139970725e-07, + "loss": 1.0908, + "step": 445 + }, + { + "epoch": 0.6554004408523145, + "grad_norm": 1.364990935232205, + "learning_rate": 2.8014815254526475e-07, + "loss": 1.0506, + "step": 446 + }, + { + "epoch": 0.65686994856723, + "grad_norm": 1.4165052577271036, + "learning_rate": 2.780098401024816e-07, + "loss": 1.1602, + "step": 447 + }, + { + "epoch": 0.6583394562821455, + "grad_norm": 1.4602397372513622, + "learning_rate": 2.7587657266721633e-07, + "loss": 1.0126, + "step": 448 + }, + { + "epoch": 0.659808963997061, + "grad_norm": 1.3583523079909423, + "learning_rate": 2.737483987206725e-07, + "loss": 1.0467, + "step": 449 + }, + { + "epoch": 0.6612784717119765, + "grad_norm": 1.376858530036489, + "learning_rate": 2.7162536662829836e-07, + "loss": 1.0401, + "step": 450 + }, + { + "epoch": 0.662747979426892, + "grad_norm": 1.403999668213476, + "learning_rate": 2.695075246386874e-07, + "loss": 1.0438, + "step": 451 + }, + { + "epoch": 0.6642174871418075, + "grad_norm": 1.3971075292397948, + "learning_rate": 2.673949208824804e-07, + "loss": 1.0229, + "step": 452 + }, + { + "epoch": 0.665686994856723, + "grad_norm": 1.4601504097247877, + "learning_rate": 2.6528760337127344e-07, + "loss": 1.0741, + "step": 453 + }, + { + "epoch": 0.6671565025716385, + "grad_norm": 1.3882632206062708, + "learning_rate": 2.6318561999652543e-07, + "loss": 1.1148, + "step": 454 + }, + { + "epoch": 0.668626010286554, + "grad_norm": 1.4454566469056964, + "learning_rate": 2.610890185284707e-07, + "loss": 1.155, + "step": 455 + }, + { + "epoch": 0.6700955180014695, + "grad_norm": 1.4144159704417698, + "learning_rate": 2.5899784661503306e-07, + "loss": 1.1574, + "step": 456 + }, + { + "epoch": 0.671565025716385, + "grad_norm": 1.3227253044283795, + "learning_rate": 2.569121517807421e-07, + "loss": 1.0814, + "step": 457 + }, + { + "epoch": 0.6730345334313005, + "grad_norm": 1.4016559184728539, + "learning_rate": 2.5483198142565454e-07, + "loss": 1.1234, + "step": 458 + }, + { + "epoch": 0.674504041146216, + "grad_norm": 1.407280713108974, + "learning_rate": 2.5275738282427627e-07, + "loss": 1.1664, + "step": 459 + }, + { + "epoch": 0.6759735488611315, + "grad_norm": 1.4047211167155722, + "learning_rate": 2.506884031244875e-07, + "loss": 1.0663, + "step": 460 + }, + { + "epoch": 0.677443056576047, + "grad_norm": 1.4210107328567902, + "learning_rate": 2.4862508934647215e-07, + "loss": 1.0806, + "step": 461 + }, + { + "epoch": 0.6789125642909625, + "grad_norm": 1.3638119047997457, + "learning_rate": 2.465674883816492e-07, + "loss": 1.1283, + "step": 462 + }, + { + "epoch": 0.6803820720058781, + "grad_norm": 1.3846202967932413, + "learning_rate": 2.445156469916059e-07, + "loss": 1.0391, + "step": 463 + }, + { + "epoch": 0.6818515797207936, + "grad_norm": 1.4492638193493528, + "learning_rate": 2.424696118070367e-07, + "loss": 1.1258, + "step": 464 + }, + { + "epoch": 0.6833210874357091, + "grad_norm": 1.3695917043777497, + "learning_rate": 2.404294293266823e-07, + "loss": 1.0027, + "step": 465 + }, + { + "epoch": 0.6847905951506246, + "grad_norm": 1.3993284836421556, + "learning_rate": 2.3839514591627298e-07, + "loss": 1.1171, + "step": 466 + }, + { + "epoch": 0.6862601028655401, + "grad_norm": 1.3323985034089447, + "learning_rate": 2.3636680780747574e-07, + "loss": 1.1226, + "step": 467 + }, + { + "epoch": 0.6877296105804556, + "grad_norm": 1.3810241812868833, + "learning_rate": 2.3434446109684303e-07, + "loss": 1.1231, + "step": 468 + }, + { + "epoch": 0.6891991182953711, + "grad_norm": 1.3868467700476377, + "learning_rate": 2.323281517447646e-07, + "loss": 1.1609, + "step": 469 + }, + { + "epoch": 0.6906686260102866, + "grad_norm": 1.4013984837064835, + "learning_rate": 2.3031792557442426e-07, + "loss": 1.0457, + "step": 470 + }, + { + "epoch": 0.692138133725202, + "grad_norm": 1.4712416790362257, + "learning_rate": 2.2831382827075758e-07, + "loss": 1.0643, + "step": 471 + }, + { + "epoch": 0.6936076414401176, + "grad_norm": 1.3781792405401019, + "learning_rate": 2.2631590537941348e-07, + "loss": 1.0947, + "step": 472 + }, + { + "epoch": 0.695077149155033, + "grad_norm": 1.4117456798087704, + "learning_rate": 2.2432420230572014e-07, + "loss": 1.1242, + "step": 473 + }, + { + "epoch": 0.6965466568699485, + "grad_norm": 1.3486912287876163, + "learning_rate": 2.223387643136524e-07, + "loss": 1.1288, + "step": 474 + }, + { + "epoch": 0.698016164584864, + "grad_norm": 1.4003361923857591, + "learning_rate": 2.2035963652480266e-07, + "loss": 1.0847, + "step": 475 + }, + { + "epoch": 0.6994856722997795, + "grad_norm": 1.4405126411409146, + "learning_rate": 2.183868639173568e-07, + "loss": 1.0318, + "step": 476 + }, + { + "epoch": 0.700955180014695, + "grad_norm": 1.3517140108244523, + "learning_rate": 2.1642049132507013e-07, + "loss": 1.1525, + "step": 477 + }, + { + "epoch": 0.7024246877296105, + "grad_norm": 1.401420474546432, + "learning_rate": 2.144605634362504e-07, + "loss": 1.0637, + "step": 478 + }, + { + "epoch": 0.7038941954445261, + "grad_norm": 1.4272891896192437, + "learning_rate": 2.125071247927412e-07, + "loss": 1.0649, + "step": 479 + }, + { + "epoch": 0.7053637031594416, + "grad_norm": 1.4147072802744984, + "learning_rate": 2.1056021978890915e-07, + "loss": 1.0819, + "step": 480 + }, + { + "epoch": 0.7068332108743571, + "grad_norm": 1.489071300786935, + "learning_rate": 2.0861989267063622e-07, + "loss": 1.1086, + "step": 481 + }, + { + "epoch": 0.7083027185892726, + "grad_norm": 1.3674178536485293, + "learning_rate": 2.0668618753431372e-07, + "loss": 1.126, + "step": 482 + }, + { + "epoch": 0.7097722263041881, + "grad_norm": 1.4610080335049196, + "learning_rate": 2.0475914832583936e-07, + "loss": 1.1212, + "step": 483 + }, + { + "epoch": 0.7112417340191036, + "grad_norm": 1.3760740804936542, + "learning_rate": 2.0283881883961978e-07, + "loss": 1.1276, + "step": 484 + }, + { + "epoch": 0.7127112417340191, + "grad_norm": 1.3670564421892966, + "learning_rate": 2.0092524271757472e-07, + "loss": 1.0993, + "step": 485 + }, + { + "epoch": 0.7141807494489346, + "grad_norm": 1.394984491530565, + "learning_rate": 1.990184634481446e-07, + "loss": 1.089, + "step": 486 + }, + { + "epoch": 0.7156502571638501, + "grad_norm": 1.4571577516475638, + "learning_rate": 1.9711852436530318e-07, + "loss": 1.125, + "step": 487 + }, + { + "epoch": 0.7171197648787656, + "grad_norm": 1.5499839330419916, + "learning_rate": 1.952254686475726e-07, + "loss": 1.1264, + "step": 488 + }, + { + "epoch": 0.7185892725936811, + "grad_norm": 1.3781629330474126, + "learning_rate": 1.9333933931704098e-07, + "loss": 1.0566, + "step": 489 + }, + { + "epoch": 0.7200587803085966, + "grad_norm": 1.4387738954738303, + "learning_rate": 1.914601792383862e-07, + "loss": 1.0665, + "step": 490 + }, + { + "epoch": 0.7215282880235121, + "grad_norm": 1.3566125913075402, + "learning_rate": 1.8958803111790105e-07, + "loss": 1.0703, + "step": 491 + }, + { + "epoch": 0.7229977957384276, + "grad_norm": 1.3556380519533342, + "learning_rate": 1.877229375025222e-07, + "loss": 1.1557, + "step": 492 + }, + { + "epoch": 0.7244673034533431, + "grad_norm": 1.3370842913006449, + "learning_rate": 1.8586494077886416e-07, + "loss": 1.098, + "step": 493 + }, + { + "epoch": 0.7259368111682586, + "grad_norm": 1.5097558318383415, + "learning_rate": 1.840140831722557e-07, + "loss": 1.1044, + "step": 494 + }, + { + "epoch": 0.7274063188831741, + "grad_norm": 1.4260075475587533, + "learning_rate": 1.821704067457795e-07, + "loss": 1.0891, + "step": 495 + }, + { + "epoch": 0.7288758265980897, + "grad_norm": 1.4163474844245585, + "learning_rate": 1.803339533993175e-07, + "loss": 1.1143, + "step": 496 + }, + { + "epoch": 0.7303453343130052, + "grad_norm": 1.3471773518411894, + "learning_rate": 1.7850476486859784e-07, + "loss": 1.1001, + "step": 497 + }, + { + "epoch": 0.7318148420279207, + "grad_norm": 1.349662778135853, + "learning_rate": 1.766828827242461e-07, + "loss": 1.0691, + "step": 498 + }, + { + "epoch": 0.7332843497428362, + "grad_norm": 1.413952156125001, + "learning_rate": 1.7486834837084147e-07, + "loss": 1.1791, + "step": 499 + }, + { + "epoch": 0.7347538574577517, + "grad_norm": 1.4517736063725735, + "learning_rate": 1.7306120304597516e-07, + "loss": 1.1076, + "step": 500 + }, + { + "epoch": 0.7347538574577517, + "eval_ical_mcts_chains_sft_val_MORECHAINS_loss": 2.7672901153564453, + "eval_ical_mcts_chains_sft_val_MORECHAINS_runtime": 5.874, + "eval_ical_mcts_chains_sft_val_MORECHAINS_samples_per_second": 15.832, + "eval_ical_mcts_chains_sft_val_MORECHAINS_steps_per_second": 2.043, + "step": 500 + }, + { + "epoch": 0.7362233651726672, + "grad_norm": 1.4017708695893076, + "learning_rate": 1.7126148781931309e-07, + "loss": 1.2015, + "step": 501 + }, + { + "epoch": 0.7376928728875827, + "grad_norm": 1.373359024365262, + "learning_rate": 1.6946924359166332e-07, + "loss": 1.1271, + "step": 502 + }, + { + "epoch": 0.7391623806024982, + "grad_norm": 1.3527598369707952, + "learning_rate": 1.6768451109404518e-07, + "loss": 1.1452, + "step": 503 + }, + { + "epoch": 0.7406318883174137, + "grad_norm": 1.4342813387133044, + "learning_rate": 1.659073308867653e-07, + "loss": 1.0873, + "step": 504 + }, + { + "epoch": 0.7421013960323292, + "grad_norm": 1.3934397068509279, + "learning_rate": 1.641377433584945e-07, + "loss": 1.1442, + "step": 505 + }, + { + "epoch": 0.7435709037472447, + "grad_norm": 1.3468387357646852, + "learning_rate": 1.6237578872535023e-07, + "loss": 1.0956, + "step": 506 + }, + { + "epoch": 0.7450404114621602, + "grad_norm": 1.3379393102784192, + "learning_rate": 1.6062150702998307e-07, + "loss": 1.1059, + "step": 507 + }, + { + "epoch": 0.7465099191770757, + "grad_norm": 1.4208106676166712, + "learning_rate": 1.5887493814066632e-07, + "loss": 1.0594, + "step": 508 + }, + { + "epoch": 0.7479794268919912, + "grad_norm": 1.3284093386527018, + "learning_rate": 1.5713612175038953e-07, + "loss": 1.0709, + "step": 509 + }, + { + "epoch": 0.7494489346069066, + "grad_norm": 1.3401638532562588, + "learning_rate": 1.5540509737595752e-07, + "loss": 1.0478, + "step": 510 + }, + { + "epoch": 0.7509184423218221, + "grad_norm": 1.387779204411568, + "learning_rate": 1.536819043570915e-07, + "loss": 1.0749, + "step": 511 + }, + { + "epoch": 0.7523879500367376, + "grad_norm": 1.3361938042809636, + "learning_rate": 1.5196658185553484e-07, + "loss": 1.0633, + "step": 512 + }, + { + "epoch": 0.7538574577516532, + "grad_norm": 1.4464634414727173, + "learning_rate": 1.5025916885416385e-07, + "loss": 1.0915, + "step": 513 + }, + { + "epoch": 0.7553269654665687, + "grad_norm": 1.4564936678530274, + "learning_rate": 1.485597041561014e-07, + "loss": 1.181, + "step": 514 + }, + { + "epoch": 0.7567964731814842, + "grad_norm": 1.3813741239097441, + "learning_rate": 1.4686822638383485e-07, + "loss": 1.1353, + "step": 515 + }, + { + "epoch": 0.7582659808963997, + "grad_norm": 1.3284963146667195, + "learning_rate": 1.4518477397833868e-07, + "loss": 1.0876, + "step": 516 + }, + { + "epoch": 0.7597354886113152, + "grad_norm": 1.3825280093987735, + "learning_rate": 1.4350938519820082e-07, + "loss": 1.1206, + "step": 517 + }, + { + "epoch": 0.7612049963262307, + "grad_norm": 1.4124836120841386, + "learning_rate": 1.4184209811875314e-07, + "loss": 1.0521, + "step": 518 + }, + { + "epoch": 0.7626745040411462, + "grad_norm": 1.3957995246414707, + "learning_rate": 1.401829506312061e-07, + "loss": 1.1441, + "step": 519 + }, + { + "epoch": 0.7641440117560617, + "grad_norm": 2.064618166415374, + "learning_rate": 1.385319804417872e-07, + "loss": 1.0951, + "step": 520 + }, + { + "epoch": 0.7656135194709772, + "grad_norm": 1.3751431109514682, + "learning_rate": 1.3688922507088506e-07, + "loss": 1.0622, + "step": 521 + }, + { + "epoch": 0.7670830271858927, + "grad_norm": 1.4467861329866494, + "learning_rate": 1.35254721852196e-07, + "loss": 1.1695, + "step": 522 + }, + { + "epoch": 0.7685525349008082, + "grad_norm": 1.37927918872005, + "learning_rate": 1.3362850793187536e-07, + "loss": 1.1666, + "step": 523 + }, + { + "epoch": 0.7700220426157237, + "grad_norm": 1.402761123927447, + "learning_rate": 1.3201062026769415e-07, + "loss": 1.1472, + "step": 524 + }, + { + "epoch": 0.7714915503306392, + "grad_norm": 1.4020269134593764, + "learning_rate": 1.3040109562819852e-07, + "loss": 1.1132, + "step": 525 + }, + { + "epoch": 0.7729610580455547, + "grad_norm": 1.3179769606979768, + "learning_rate": 1.2879997059187402e-07, + "loss": 1.1577, + "step": 526 + }, + { + "epoch": 0.7744305657604702, + "grad_norm": 1.3608872827916707, + "learning_rate": 1.27207281546315e-07, + "loss": 1.105, + "step": 527 + }, + { + "epoch": 0.7759000734753857, + "grad_norm": 1.4191003452870128, + "learning_rate": 1.2562306468739707e-07, + "loss": 1.061, + "step": 528 + }, + { + "epoch": 0.7773695811903012, + "grad_norm": 1.42228631098522, + "learning_rate": 1.2404735601845446e-07, + "loss": 1.0901, + "step": 529 + }, + { + "epoch": 0.7788390889052168, + "grad_norm": 1.374432034447507, + "learning_rate": 1.2248019134946224e-07, + "loss": 1.1478, + "step": 530 + }, + { + "epoch": 0.7803085966201323, + "grad_norm": 1.4436649330903892, + "learning_rate": 1.2092160629622243e-07, + "loss": 1.2448, + "step": 531 + }, + { + "epoch": 0.7817781043350478, + "grad_norm": 1.3822744122903736, + "learning_rate": 1.1937163627955388e-07, + "loss": 1.0715, + "step": 532 + }, + { + "epoch": 0.7832476120499633, + "grad_norm": 1.4003007054983763, + "learning_rate": 1.1783031652448844e-07, + "loss": 1.0834, + "step": 533 + }, + { + "epoch": 0.7847171197648788, + "grad_norm": 1.3009002478832499, + "learning_rate": 1.1629768205946916e-07, + "loss": 1.0867, + "step": 534 + }, + { + "epoch": 0.7861866274797943, + "grad_norm": 1.384225049312056, + "learning_rate": 1.1477376771555547e-07, + "loss": 1.111, + "step": 535 + }, + { + "epoch": 0.7876561351947098, + "grad_norm": 1.4951881255718384, + "learning_rate": 1.1325860812563082e-07, + "loss": 1.0905, + "step": 536 + }, + { + "epoch": 0.7891256429096253, + "grad_norm": 1.3855016278493693, + "learning_rate": 1.1175223772361548e-07, + "loss": 1.1674, + "step": 537 + }, + { + "epoch": 0.7905951506245408, + "grad_norm": 1.402883620503283, + "learning_rate": 1.1025469074368465e-07, + "loss": 1.0934, + "step": 538 + }, + { + "epoch": 0.7920646583394563, + "grad_norm": 1.3721282887774249, + "learning_rate": 1.0876600121949014e-07, + "loss": 1.053, + "step": 539 + }, + { + "epoch": 0.7935341660543718, + "grad_norm": 1.3027250754946427, + "learning_rate": 1.0728620298338647e-07, + "loss": 0.9932, + "step": 540 + }, + { + "epoch": 0.7950036737692873, + "grad_norm": 1.3472091716626784, + "learning_rate": 1.058153296656627e-07, + "loss": 1.0411, + "step": 541 + }, + { + "epoch": 0.7964731814842028, + "grad_norm": 1.2515159561759799, + "learning_rate": 1.0435341469377785e-07, + "loss": 1.0409, + "step": 542 + }, + { + "epoch": 0.7979426891991183, + "grad_norm": 1.3955411540097424, + "learning_rate": 1.0290049129160083e-07, + "loss": 1.0847, + "step": 543 + }, + { + "epoch": 0.7994121969140338, + "grad_norm": 1.378388349777933, + "learning_rate": 1.0145659247865606e-07, + "loss": 1.0637, + "step": 544 + }, + { + "epoch": 0.8008817046289493, + "grad_norm": 1.3665203225260263, + "learning_rate": 1.0002175106937282e-07, + "loss": 1.083, + "step": 545 + }, + { + "epoch": 0.8023512123438648, + "grad_norm": 1.3325384707568575, + "learning_rate": 9.859599967233901e-08, + "loss": 1.0619, + "step": 546 + }, + { + "epoch": 0.8038207200587804, + "grad_norm": 1.3079374620261692, + "learning_rate": 9.717937068956083e-08, + "loss": 1.0893, + "step": 547 + }, + { + "epoch": 0.8052902277736959, + "grad_norm": 1.386818107212991, + "learning_rate": 9.577189631572613e-08, + "loss": 1.1115, + "step": 548 + }, + { + "epoch": 0.8067597354886114, + "grad_norm": 1.374316392770813, + "learning_rate": 9.437360853747223e-08, + "loss": 1.1449, + "step": 549 + }, + { + "epoch": 0.8082292432035268, + "grad_norm": 1.345287353676708, + "learning_rate": 9.29845391326598e-08, + "loss": 1.0576, + "step": 550 + }, + { + "epoch": 0.8096987509184423, + "grad_norm": 1.4813724241887494, + "learning_rate": 9.16047196696505e-08, + "loss": 1.092, + "step": 551 + }, + { + "epoch": 0.8111682586333578, + "grad_norm": 1.3806739470510292, + "learning_rate": 9.023418150658863e-08, + "loss": 1.0925, + "step": 552 + }, + { + "epoch": 0.8126377663482733, + "grad_norm": 1.3661938482257079, + "learning_rate": 8.887295579068988e-08, + "loss": 1.0278, + "step": 553 + }, + { + "epoch": 0.8141072740631888, + "grad_norm": 1.376286461998827, + "learning_rate": 8.752107345753262e-08, + "loss": 1.0203, + "step": 554 + }, + { + "epoch": 0.8155767817781043, + "grad_norm": 1.3714175983707415, + "learning_rate": 8.617856523035466e-08, + "loss": 1.0947, + "step": 555 + }, + { + "epoch": 0.8170462894930198, + "grad_norm": 1.3804861761749325, + "learning_rate": 8.484546161935596e-08, + "loss": 1.0772, + "step": 556 + }, + { + "epoch": 0.8185157972079353, + "grad_norm": 1.374313789743847, + "learning_rate": 8.352179292100403e-08, + "loss": 1.0653, + "step": 557 + }, + { + "epoch": 0.8199853049228508, + "grad_norm": 1.3618284844063149, + "learning_rate": 8.220758921734649e-08, + "loss": 1.0513, + "step": 558 + }, + { + "epoch": 0.8214548126377663, + "grad_norm": 1.3803405029205986, + "learning_rate": 8.090288037532706e-08, + "loss": 1.1005, + "step": 559 + }, + { + "epoch": 0.8229243203526818, + "grad_norm": 1.3988974278658783, + "learning_rate": 7.960769604610618e-08, + "loss": 1.1205, + "step": 560 + }, + { + "epoch": 0.8243938280675973, + "grad_norm": 1.3526087062032914, + "learning_rate": 7.83220656643881e-08, + "loss": 1.0322, + "step": 561 + }, + { + "epoch": 0.8258633357825128, + "grad_norm": 1.3213029228844033, + "learning_rate": 7.704601844775155e-08, + "loss": 1.071, + "step": 562 + }, + { + "epoch": 0.8273328434974284, + "grad_norm": 1.4988675658645414, + "learning_rate": 7.577958339598529e-08, + "loss": 1.0068, + "step": 563 + }, + { + "epoch": 0.8288023512123439, + "grad_norm": 1.4472214140414408, + "learning_rate": 7.452278929042982e-08, + "loss": 1.1286, + "step": 564 + }, + { + "epoch": 0.8302718589272594, + "grad_norm": 1.3652634375890536, + "learning_rate": 7.327566469332303e-08, + "loss": 1.1308, + "step": 565 + }, + { + "epoch": 0.8317413666421749, + "grad_norm": 1.3655886385805245, + "learning_rate": 7.203823794715041e-08, + "loss": 1.1015, + "step": 566 + }, + { + "epoch": 0.8332108743570904, + "grad_norm": 1.2502888959822873, + "learning_rate": 7.08105371740021e-08, + "loss": 1.0829, + "step": 567 + }, + { + "epoch": 0.8346803820720059, + "grad_norm": 1.366330743048358, + "learning_rate": 6.959259027493303e-08, + "loss": 1.1187, + "step": 568 + }, + { + "epoch": 0.8361498897869214, + "grad_norm": 1.3645999375102753, + "learning_rate": 6.838442492932867e-08, + "loss": 1.0662, + "step": 569 + }, + { + "epoch": 0.8376193975018369, + "grad_norm": 1.4139397317247722, + "learning_rate": 6.718606859427673e-08, + "loss": 1.1166, + "step": 570 + }, + { + "epoch": 0.8390889052167524, + "grad_norm": 1.4003308125547838, + "learning_rate": 6.599754850394263e-08, + "loss": 1.1006, + "step": 571 + }, + { + "epoch": 0.8405584129316679, + "grad_norm": 1.3719288368595044, + "learning_rate": 6.481889166895033e-08, + "loss": 1.0538, + "step": 572 + }, + { + "epoch": 0.8420279206465834, + "grad_norm": 1.3259310033546516, + "learning_rate": 6.365012487576926e-08, + "loss": 1.0573, + "step": 573 + }, + { + "epoch": 0.8434974283614989, + "grad_norm": 1.4183409718497981, + "learning_rate": 6.249127468610504e-08, + "loss": 1.1412, + "step": 574 + }, + { + "epoch": 0.8449669360764144, + "grad_norm": 1.3764865756818885, + "learning_rate": 6.134236743629562e-08, + "loss": 1.1218, + "step": 575 + }, + { + "epoch": 0.8464364437913299, + "grad_norm": 1.4471292335999226, + "learning_rate": 6.020342923671334e-08, + "loss": 1.0495, + "step": 576 + }, + { + "epoch": 0.8479059515062454, + "grad_norm": 1.4152415184940785, + "learning_rate": 5.907448597117126e-08, + "loss": 1.1413, + "step": 577 + }, + { + "epoch": 0.8493754592211609, + "grad_norm": 1.3749110659106119, + "learning_rate": 5.7955563296334664e-08, + "loss": 1.1659, + "step": 578 + }, + { + "epoch": 0.8508449669360764, + "grad_norm": 1.4403389990835915, + "learning_rate": 5.6846686641138394e-08, + "loss": 1.0724, + "step": 579 + }, + { + "epoch": 0.852314474650992, + "grad_norm": 1.3113544123287635, + "learning_rate": 5.5747881206208936e-08, + "loss": 1.1002, + "step": 580 + }, + { + "epoch": 0.8537839823659075, + "grad_norm": 1.4504739763414178, + "learning_rate": 5.465917196329106e-08, + "loss": 1.0829, + "step": 581 + }, + { + "epoch": 0.855253490080823, + "grad_norm": 1.461654698052296, + "learning_rate": 5.3580583654681266e-08, + "loss": 1.0559, + "step": 582 + }, + { + "epoch": 0.8567229977957385, + "grad_norm": 1.4101135770661162, + "learning_rate": 5.251214079266475e-08, + "loss": 1.069, + "step": 583 + }, + { + "epoch": 0.858192505510654, + "grad_norm": 1.3634335354582763, + "learning_rate": 5.1453867658958704e-08, + "loss": 1.1287, + "step": 584 + }, + { + "epoch": 0.8596620132255695, + "grad_norm": 1.4283589117852606, + "learning_rate": 5.0405788304160426e-08, + "loss": 1.0935, + "step": 585 + }, + { + "epoch": 0.861131520940485, + "grad_norm": 1.434406958645411, + "learning_rate": 4.936792654720029e-08, + "loss": 1.0494, + "step": 586 + }, + { + "epoch": 0.8626010286554004, + "grad_norm": 1.4389756558713183, + "learning_rate": 4.8340305974801266e-08, + "loss": 1.0656, + "step": 587 + }, + { + "epoch": 0.864070536370316, + "grad_norm": 1.3842315564330259, + "learning_rate": 4.7322949940942325e-08, + "loss": 1.0512, + "step": 588 + }, + { + "epoch": 0.8655400440852314, + "grad_norm": 1.4148624042139615, + "learning_rate": 4.63158815663276e-08, + "loss": 1.0613, + "step": 589 + }, + { + "epoch": 0.8670095518001469, + "grad_norm": 1.3366794867264846, + "learning_rate": 4.53191237378614e-08, + "loss": 1.0486, + "step": 590 + }, + { + "epoch": 0.8684790595150624, + "grad_norm": 1.3575430817405785, + "learning_rate": 4.433269910812759e-08, + "loss": 1.0735, + "step": 591 + }, + { + "epoch": 0.8699485672299779, + "grad_norm": 1.3289314595584882, + "learning_rate": 4.335663009487511e-08, + "loss": 1.1759, + "step": 592 + }, + { + "epoch": 0.8714180749448934, + "grad_norm": 1.4000173838833194, + "learning_rate": 4.2390938880508595e-08, + "loss": 1.0408, + "step": 593 + }, + { + "epoch": 0.8728875826598089, + "grad_norm": 1.372416148633152, + "learning_rate": 4.143564741158362e-08, + "loss": 1.0626, + "step": 594 + }, + { + "epoch": 0.8743570903747244, + "grad_norm": 1.4265940660232175, + "learning_rate": 4.0490777398308753e-08, + "loss": 1.2163, + "step": 595 + }, + { + "epoch": 0.8758265980896399, + "grad_norm": 1.3925507856636796, + "learning_rate": 3.955635031405169e-08, + "loss": 1.1028, + "step": 596 + }, + { + "epoch": 0.8772961058045555, + "grad_norm": 1.324576005377705, + "learning_rate": 3.86323873948512e-08, + "loss": 1.182, + "step": 597 + }, + { + "epoch": 0.878765613519471, + "grad_norm": 1.351029057416015, + "learning_rate": 3.771890963893476e-08, + "loss": 1.1285, + "step": 598 + }, + { + "epoch": 0.8802351212343865, + "grad_norm": 1.3994327447522252, + "learning_rate": 3.681593780624137e-08, + "loss": 1.0868, + "step": 599 + }, + { + "epoch": 0.881704628949302, + "grad_norm": 1.3561535752329212, + "learning_rate": 3.5923492417949285e-08, + "loss": 1.1401, + "step": 600 + }, + { + "epoch": 0.8831741366642175, + "grad_norm": 1.3988534133786137, + "learning_rate": 3.5041593756010234e-08, + "loss": 1.0106, + "step": 601 + }, + { + "epoch": 0.884643644379133, + "grad_norm": 1.3859821796457574, + "learning_rate": 3.417026186268829e-08, + "loss": 1.0873, + "step": 602 + }, + { + "epoch": 0.8861131520940485, + "grad_norm": 1.3465915197780411, + "learning_rate": 3.3309516540104e-08, + "loss": 1.1312, + "step": 603 + }, + { + "epoch": 0.887582659808964, + "grad_norm": 1.3934103882388418, + "learning_rate": 3.2459377349784986e-08, + "loss": 1.0802, + "step": 604 + }, + { + "epoch": 0.8890521675238795, + "grad_norm": 1.437718999786439, + "learning_rate": 3.1619863612221075e-08, + "loss": 1.182, + "step": 605 + }, + { + "epoch": 0.890521675238795, + "grad_norm": 1.4477961897782676, + "learning_rate": 3.079099440642496e-08, + "loss": 1.1751, + "step": 606 + }, + { + "epoch": 0.8919911829537105, + "grad_norm": 1.3924612008055068, + "learning_rate": 2.997278856949914e-08, + "loss": 1.0903, + "step": 607 + }, + { + "epoch": 0.893460690668626, + "grad_norm": 1.3907249036661788, + "learning_rate": 2.916526469620756e-08, + "loss": 1.0676, + "step": 608 + }, + { + "epoch": 0.8949301983835415, + "grad_norm": 1.4577046884258984, + "learning_rate": 2.836844113855269e-08, + "loss": 1.2377, + "step": 609 + }, + { + "epoch": 0.896399706098457, + "grad_norm": 1.4063512104136562, + "learning_rate": 2.758233600535914e-08, + "loss": 1.0655, + "step": 610 + }, + { + "epoch": 0.8978692138133725, + "grad_norm": 1.3811245692933716, + "learning_rate": 2.6806967161861593e-08, + "loss": 1.0515, + "step": 611 + }, + { + "epoch": 0.899338721528288, + "grad_norm": 1.4687882128856604, + "learning_rate": 2.6042352229298902e-08, + "loss": 1.0785, + "step": 612 + }, + { + "epoch": 0.9008082292432035, + "grad_norm": 1.4655898408700718, + "learning_rate": 2.5288508584513814e-08, + "loss": 1.0829, + "step": 613 + }, + { + "epoch": 0.9022777369581191, + "grad_norm": 1.3913574357745526, + "learning_rate": 2.4545453359557765e-08, + "loss": 1.0768, + "step": 614 + }, + { + "epoch": 0.9037472446730346, + "grad_norm": 1.3607919197214273, + "learning_rate": 2.3813203441301778e-08, + "loss": 1.1146, + "step": 615 + }, + { + "epoch": 0.9052167523879501, + "grad_norm": 1.4196701845145032, + "learning_rate": 2.3091775471052734e-08, + "loss": 1.1263, + "step": 616 + }, + { + "epoch": 0.9066862601028656, + "grad_norm": 1.3646134363098568, + "learning_rate": 2.2381185844174644e-08, + "loss": 1.027, + "step": 617 + }, + { + "epoch": 0.9081557678177811, + "grad_norm": 1.4522612796957168, + "learning_rate": 2.168145070971683e-08, + "loss": 1.0925, + "step": 618 + }, + { + "epoch": 0.9096252755326966, + "grad_norm": 1.332216974123385, + "learning_rate": 2.099258597004644e-08, + "loss": 1.057, + "step": 619 + }, + { + "epoch": 0.9110947832476121, + "grad_norm": 1.4818724681035569, + "learning_rate": 2.031460728048695e-08, + "loss": 0.9901, + "step": 620 + }, + { + "epoch": 0.9125642909625276, + "grad_norm": 1.3874842041637756, + "learning_rate": 1.9647530048962747e-08, + "loss": 1.0946, + "step": 621 + }, + { + "epoch": 0.914033798677443, + "grad_norm": 1.3490442752252867, + "learning_rate": 1.8991369435648774e-08, + "loss": 1.1295, + "step": 622 + }, + { + "epoch": 0.9155033063923586, + "grad_norm": 1.3339563383295754, + "learning_rate": 1.8346140352625883e-08, + "loss": 1.0182, + "step": 623 + }, + { + "epoch": 0.916972814107274, + "grad_norm": 1.4464185646745273, + "learning_rate": 1.771185746354209e-08, + "loss": 1.0926, + "step": 624 + }, + { + "epoch": 0.9184423218221895, + "grad_norm": 1.3165206597074977, + "learning_rate": 1.7088535183279407e-08, + "loss": 1.1249, + "step": 625 + }, + { + "epoch": 0.919911829537105, + "grad_norm": 1.3819515848899369, + "learning_rate": 1.647618767762593e-08, + "loss": 1.1573, + "step": 626 + }, + { + "epoch": 0.9213813372520205, + "grad_norm": 1.3001816716305787, + "learning_rate": 1.5874828862954327e-08, + "loss": 1.0266, + "step": 627 + }, + { + "epoch": 0.922850844966936, + "grad_norm": 1.319276221215603, + "learning_rate": 1.5284472405905247e-08, + "loss": 1.0694, + "step": 628 + }, + { + "epoch": 0.9243203526818515, + "grad_norm": 1.4228373976898447, + "learning_rate": 1.4705131723076692e-08, + "loss": 1.1516, + "step": 629 + }, + { + "epoch": 0.925789860396767, + "grad_norm": 1.3595993483558204, + "learning_rate": 1.4136819980719472e-08, + "loss": 1.0336, + "step": 630 + }, + { + "epoch": 0.9272593681116826, + "grad_norm": 1.3326370942028576, + "learning_rate": 1.3579550094437676e-08, + "loss": 1.0843, + "step": 631 + }, + { + "epoch": 0.9287288758265981, + "grad_norm": 1.3933226824191673, + "learning_rate": 1.3033334728895119e-08, + "loss": 1.1692, + "step": 632 + }, + { + "epoch": 0.9301983835415136, + "grad_norm": 1.3772541537537315, + "learning_rate": 1.2498186297527802e-08, + "loss": 1.1253, + "step": 633 + }, + { + "epoch": 0.9316678912564291, + "grad_norm": 1.4025797539924674, + "learning_rate": 1.1974116962261527e-08, + "loss": 1.1508, + "step": 634 + }, + { + "epoch": 0.9331373989713446, + "grad_norm": 1.3051053877123164, + "learning_rate": 1.1461138633235611e-08, + "loss": 1.0791, + "step": 635 + }, + { + "epoch": 0.9346069066862601, + "grad_norm": 1.3333191944243534, + "learning_rate": 1.095926296853228e-08, + "loss": 1.1968, + "step": 636 + }, + { + "epoch": 0.9360764144011756, + "grad_norm": 1.3418559426331282, + "learning_rate": 1.0468501373911532e-08, + "loss": 1.0896, + "step": 637 + }, + { + "epoch": 0.9375459221160911, + "grad_norm": 1.34359709316688, + "learning_rate": 9.988865002552138e-09, + "loss": 1.1065, + "step": 638 + }, + { + "epoch": 0.9390154298310066, + "grad_norm": 1.3620227287509608, + "learning_rate": 9.520364754798116e-09, + "loss": 1.0744, + "step": 639 + }, + { + "epoch": 0.9404849375459221, + "grad_norm": 1.3087398924414495, + "learning_rate": 9.06301127791087e-09, + "loss": 1.072, + "step": 640 + }, + { + "epoch": 0.9419544452608376, + "grad_norm": 1.4212527547101943, + "learning_rate": 8.61681496582739e-09, + "loss": 1.0904, + "step": 641 + }, + { + "epoch": 0.9434239529757531, + "grad_norm": 1.4044007797229823, + "learning_rate": 8.181785958923938e-09, + "loss": 1.1127, + "step": 642 + }, + { + "epoch": 0.9448934606906686, + "grad_norm": 1.3492187382144503, + "learning_rate": 7.757934143785561e-09, + "loss": 1.1309, + "step": 643 + }, + { + "epoch": 0.9463629684055841, + "grad_norm": 1.3882974174543632, + "learning_rate": 7.345269152981614e-09, + "loss": 1.0746, + "step": 644 + }, + { + "epoch": 0.9478324761204996, + "grad_norm": 1.3919108675813483, + "learning_rate": 6.943800364846653e-09, + "loss": 1.1149, + "step": 645 + }, + { + "epoch": 0.9493019838354151, + "grad_norm": 1.4427576763993042, + "learning_rate": 6.5535369032672095e-09, + "loss": 1.1289, + "step": 646 + }, + { + "epoch": 0.9507714915503307, + "grad_norm": 1.4253308850896702, + "learning_rate": 6.174487637474801e-09, + "loss": 1.1042, + "step": 647 + }, + { + "epoch": 0.9522409992652462, + "grad_norm": 1.400034959006393, + "learning_rate": 5.806661181843919e-09, + "loss": 1.0803, + "step": 648 + }, + { + "epoch": 0.9537105069801617, + "grad_norm": 1.435130822553809, + "learning_rate": 5.450065895696632e-09, + "loss": 1.0796, + "step": 649 + }, + { + "epoch": 0.9551800146950772, + "grad_norm": 1.4035193875819156, + "learning_rate": 5.1047098831125124e-09, + "loss": 1.1129, + "step": 650 + }, + { + "epoch": 0.9566495224099927, + "grad_norm": 1.5238894343006477, + "learning_rate": 4.770600992744178e-09, + "loss": 1.1102, + "step": 651 + }, + { + "epoch": 0.9581190301249082, + "grad_norm": 1.3438191535373563, + "learning_rate": 4.4477468176393196e-09, + "loss": 1.0581, + "step": 652 + }, + { + "epoch": 0.9595885378398237, + "grad_norm": 1.3010475336208236, + "learning_rate": 4.136154695068006e-09, + "loss": 1.0478, + "step": 653 + }, + { + "epoch": 0.9610580455547392, + "grad_norm": 1.3915831877261908, + "learning_rate": 3.8358317063557635e-09, + "loss": 1.0616, + "step": 654 + }, + { + "epoch": 0.9625275532696547, + "grad_norm": 1.3754622477764384, + "learning_rate": 3.546784676722925e-09, + "loss": 1.0684, + "step": 655 + }, + { + "epoch": 0.9639970609845702, + "grad_norm": 1.3958030478092502, + "learning_rate": 3.2690201751292002e-09, + "loss": 1.1125, + "step": 656 + }, + { + "epoch": 0.9654665686994857, + "grad_norm": 1.4118012322738474, + "learning_rate": 3.002544514124683e-09, + "loss": 1.0673, + "step": 657 + }, + { + "epoch": 0.9669360764144012, + "grad_norm": 1.3931161551594535, + "learning_rate": 2.747363749706244e-09, + "loss": 1.1434, + "step": 658 + }, + { + "epoch": 0.9684055841293167, + "grad_norm": 1.320233907465206, + "learning_rate": 2.5034836811799744e-09, + "loss": 1.0626, + "step": 659 + }, + { + "epoch": 0.9698750918442322, + "grad_norm": 1.40459539845844, + "learning_rate": 2.2709098510292347e-09, + "loss": 1.1047, + "step": 660 + }, + { + "epoch": 0.9713445995591476, + "grad_norm": 1.4426402012148907, + "learning_rate": 2.049647544788813e-09, + "loss": 1.0249, + "step": 661 + }, + { + "epoch": 0.9728141072740631, + "grad_norm": 1.37434918164306, + "learning_rate": 1.8397017909249634e-09, + "loss": 1.0911, + "step": 662 + }, + { + "epoch": 0.9742836149889786, + "grad_norm": 1.4108384424402636, + "learning_rate": 1.6410773607206663e-09, + "loss": 1.1213, + "step": 663 + }, + { + "epoch": 0.9757531227038942, + "grad_norm": 1.4926584455705627, + "learning_rate": 1.4537787681677683e-09, + "loss": 1.1082, + "step": 664 + }, + { + "epoch": 0.9772226304188097, + "grad_norm": 1.3595606693253082, + "learning_rate": 1.2778102698638993e-09, + "loss": 1.0751, + "step": 665 + }, + { + "epoch": 0.9786921381337252, + "grad_norm": 1.4509988964421765, + "learning_rate": 1.1131758649160494e-09, + "loss": 1.0865, + "step": 666 + }, + { + "epoch": 0.9801616458486407, + "grad_norm": 1.3679120087919863, + "learning_rate": 9.598792948496414e-10, + "loss": 1.0855, + "step": 667 + }, + { + "epoch": 0.9816311535635562, + "grad_norm": 1.346685189196511, + "learning_rate": 8.179240435232659e-10, + "loss": 1.0846, + "step": 668 + }, + { + "epoch": 0.9831006612784717, + "grad_norm": 1.3196217753280912, + "learning_rate": 6.873133370498551e-10, + "loss": 1.0291, + "step": 669 + }, + { + "epoch": 0.9845701689933872, + "grad_norm": 1.3795048616719339, + "learning_rate": 5.680501437230755e-10, + "loss": 1.0951, + "step": 670 + }, + { + "epoch": 0.9860396767083027, + "grad_norm": 1.3485249223159093, + "learning_rate": 4.6013717395010365e-10, + "loss": 1.0971, + "step": 671 + }, + { + "epoch": 0.9875091844232182, + "grad_norm": 1.507892762769766, + "learning_rate": 3.63576880189731e-10, + "loss": 1.1302, + "step": 672 + }, + { + "epoch": 0.9889786921381337, + "grad_norm": 1.362116408246938, + "learning_rate": 2.783714568970197e-10, + "loss": 1.1335, + "step": 673 + }, + { + "epoch": 0.9904481998530492, + "grad_norm": 1.4376872456307725, + "learning_rate": 2.045228404731203e-10, + "loss": 1.0601, + "step": 674 + }, + { + "epoch": 0.9919177075679647, + "grad_norm": 1.3558644824462847, + "learning_rate": 1.4203270922125143e-10, + "loss": 1.0517, + "step": 675 + }, + { + "epoch": 0.9933872152828802, + "grad_norm": 1.4975627645392091, + "learning_rate": 9.090248330889671e-11, + "loss": 1.0872, + "step": 676 + }, + { + "epoch": 0.9948567229977957, + "grad_norm": 1.5061190709997394, + "learning_rate": 5.1133324735164183e-11, + "loss": 1.0224, + "step": 677 + }, + { + "epoch": 0.9963262307127112, + "grad_norm": 1.4530193394151236, + "learning_rate": 2.2726137304529546e-11, + "loss": 1.0538, + "step": 678 + }, + { + "epoch": 0.9977957384276267, + "grad_norm": 1.3983394789120556, + "learning_rate": 5.6815666063525505e-12, + "loss": 1.0876, + "step": 679 + }, + { + "epoch": 0.9992652461425422, + "grad_norm": 1.4207479289834348, + "learning_rate": 0.0, + "loss": 1.1348, + "step": 680 + } + ], + "logging_steps": 1, + "max_steps": 680, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 711821505200128.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}