{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992652461425422,
"eval_steps": 250,
"global_step": 680,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014695077149155032,
"grad_norm": 17.0145143368335,
"learning_rate": 4.7619047619047613e-08,
"loss": 1.4955,
"step": 1
},
{
"epoch": 0.0029390154298310064,
"grad_norm": 16.880312139442445,
"learning_rate": 9.523809523809523e-08,
"loss": 1.5062,
"step": 2
},
{
"epoch": 0.00440852314474651,
"grad_norm": 17.464051339263715,
"learning_rate": 1.4285714285714285e-07,
"loss": 1.4331,
"step": 3
},
{
"epoch": 0.005878030859662013,
"grad_norm": 17.225034100163874,
"learning_rate": 1.9047619047619045e-07,
"loss": 1.4944,
"step": 4
},
{
"epoch": 0.0073475385745775165,
"grad_norm": 16.85132116546643,
"learning_rate": 2.3809523809523806e-07,
"loss": 1.5036,
"step": 5
},
{
"epoch": 0.00881704628949302,
"grad_norm": 17.041527845979022,
"learning_rate": 2.857142857142857e-07,
"loss": 1.4956,
"step": 6
},
{
"epoch": 0.010286554004408524,
"grad_norm": 15.881569858334686,
"learning_rate": 3.333333333333333e-07,
"loss": 1.5216,
"step": 7
},
{
"epoch": 0.011756061719324026,
"grad_norm": 16.31701070286596,
"learning_rate": 3.809523809523809e-07,
"loss": 1.3997,
"step": 8
},
{
"epoch": 0.01322556943423953,
"grad_norm": 14.776486508864684,
"learning_rate": 4.285714285714285e-07,
"loss": 1.4804,
"step": 9
},
{
"epoch": 0.014695077149155033,
"grad_norm": 14.557720212887741,
"learning_rate": 4.761904761904761e-07,
"loss": 1.5391,
"step": 10
},
{
"epoch": 0.016164584864070537,
"grad_norm": 10.131668683974828,
"learning_rate": 5.238095238095238e-07,
"loss": 1.402,
"step": 11
},
{
"epoch": 0.01763409257898604,
"grad_norm": 10.271144542549273,
"learning_rate": 5.714285714285714e-07,
"loss": 1.5026,
"step": 12
},
{
"epoch": 0.019103600293901544,
"grad_norm": 9.729493771535271,
"learning_rate": 6.19047619047619e-07,
"loss": 1.4665,
"step": 13
},
{
"epoch": 0.020573108008817047,
"grad_norm": 9.027774183461707,
"learning_rate": 6.666666666666666e-07,
"loss": 1.43,
"step": 14
},
{
"epoch": 0.02204261572373255,
"grad_norm": 7.227763994209855,
"learning_rate": 7.142857142857143e-07,
"loss": 1.4461,
"step": 15
},
{
"epoch": 0.02351212343864805,
"grad_norm": 6.971794402178872,
"learning_rate": 7.619047619047618e-07,
"loss": 1.3946,
"step": 16
},
{
"epoch": 0.024981631153563555,
"grad_norm": 6.515516782524198,
"learning_rate": 8.095238095238095e-07,
"loss": 1.387,
"step": 17
},
{
"epoch": 0.02645113886847906,
"grad_norm": 6.20741883421894,
"learning_rate": 8.57142857142857e-07,
"loss": 1.3349,
"step": 18
},
{
"epoch": 0.027920646583394562,
"grad_norm": 6.090657009640401,
"learning_rate": 9.047619047619047e-07,
"loss": 1.4995,
"step": 19
},
{
"epoch": 0.029390154298310066,
"grad_norm": 4.979473826437438,
"learning_rate": 9.523809523809522e-07,
"loss": 1.2842,
"step": 20
},
{
"epoch": 0.03085966201322557,
"grad_norm": 4.732515914373556,
"learning_rate": 1e-06,
"loss": 1.2816,
"step": 21
},
{
"epoch": 0.03232916972814107,
"grad_norm": 4.780927371855901,
"learning_rate": 9.999943184333936e-07,
"loss": 1.3153,
"step": 22
},
{
"epoch": 0.03379867744305658,
"grad_norm": 4.494683448782365,
"learning_rate": 9.999772738626954e-07,
"loss": 1.3077,
"step": 23
},
{
"epoch": 0.03526818515797208,
"grad_norm": 4.329105160843414,
"learning_rate": 9.999488666752648e-07,
"loss": 1.2857,
"step": 24
},
{
"epoch": 0.036737692872887584,
"grad_norm": 4.180952310668344,
"learning_rate": 9.99909097516691e-07,
"loss": 1.3751,
"step": 25
},
{
"epoch": 0.03820720058780309,
"grad_norm": 4.033165642753543,
"learning_rate": 9.998579672907788e-07,
"loss": 1.3178,
"step": 26
},
{
"epoch": 0.03967670830271859,
"grad_norm": 3.4246600875739426,
"learning_rate": 9.99795477159527e-07,
"loss": 1.2873,
"step": 27
},
{
"epoch": 0.041146216017634095,
"grad_norm": 3.274963547004543,
"learning_rate": 9.99721628543103e-07,
"loss": 1.1994,
"step": 28
},
{
"epoch": 0.0426157237325496,
"grad_norm": 3.331357181342743,
"learning_rate": 9.996364231198103e-07,
"loss": 1.2571,
"step": 29
},
{
"epoch": 0.0440852314474651,
"grad_norm": 3.3784313618805992,
"learning_rate": 9.9953986282605e-07,
"loss": 1.2402,
"step": 30
},
{
"epoch": 0.045554739162380606,
"grad_norm": 3.0698830421191072,
"learning_rate": 9.99431949856277e-07,
"loss": 1.2305,
"step": 31
},
{
"epoch": 0.0470242468772961,
"grad_norm": 3.4883161944534082,
"learning_rate": 9.9931268666295e-07,
"loss": 1.2108,
"step": 32
},
{
"epoch": 0.048493754592211606,
"grad_norm": 2.7892474532912663,
"learning_rate": 9.991820759564766e-07,
"loss": 1.2711,
"step": 33
},
{
"epoch": 0.04996326230712711,
"grad_norm": 2.8458896007246666,
"learning_rate": 9.990401207051504e-07,
"loss": 1.173,
"step": 34
},
{
"epoch": 0.05143277002204261,
"grad_norm": 2.5260926512388284,
"learning_rate": 9.98886824135084e-07,
"loss": 1.2025,
"step": 35
},
{
"epoch": 0.05290227773695812,
"grad_norm": 2.3877298964215483,
"learning_rate": 9.98722189730136e-07,
"loss": 1.2579,
"step": 36
},
{
"epoch": 0.05437178545187362,
"grad_norm": 2.2852004307826195,
"learning_rate": 9.985462212318322e-07,
"loss": 1.2378,
"step": 37
},
{
"epoch": 0.055841293166789124,
"grad_norm": 2.36994043645677,
"learning_rate": 9.983589226392792e-07,
"loss": 1.1434,
"step": 38
},
{
"epoch": 0.05731080088170463,
"grad_norm": 2.3285859279194665,
"learning_rate": 9.98160298209075e-07,
"loss": 1.2527,
"step": 39
},
{
"epoch": 0.05878030859662013,
"grad_norm": 2.051942671724978,
"learning_rate": 9.97950352455211e-07,
"loss": 1.1443,
"step": 40
},
{
"epoch": 0.060249816311535635,
"grad_norm": 2.0353118851309318,
"learning_rate": 9.977290901489707e-07,
"loss": 1.1874,
"step": 41
},
{
"epoch": 0.06171932402645114,
"grad_norm": 2.250387145318064,
"learning_rate": 9.9749651631882e-07,
"loss": 1.1452,
"step": 42
},
{
"epoch": 0.06318883174136664,
"grad_norm": 2.179679997701691,
"learning_rate": 9.972526362502937e-07,
"loss": 1.2108,
"step": 43
},
{
"epoch": 0.06465833945628215,
"grad_norm": 2.110595858158538,
"learning_rate": 9.969974554858754e-07,
"loss": 1.1414,
"step": 44
},
{
"epoch": 0.06612784717119764,
"grad_norm": 2.2405504959664535,
"learning_rate": 9.967309798248707e-07,
"loss": 1.1567,
"step": 45
},
{
"epoch": 0.06759735488611315,
"grad_norm": 2.1992469613966414,
"learning_rate": 9.96453215323277e-07,
"loss": 1.1649,
"step": 46
},
{
"epoch": 0.06906686260102865,
"grad_norm": 2.1870509425005804,
"learning_rate": 9.961641682936442e-07,
"loss": 1.1701,
"step": 47
},
{
"epoch": 0.07053637031594416,
"grad_norm": 2.144331046664921,
"learning_rate": 9.95863845304932e-07,
"loss": 1.1498,
"step": 48
},
{
"epoch": 0.07200587803085966,
"grad_norm": 2.1944289892742774,
"learning_rate": 9.955522531823606e-07,
"loss": 1.2148,
"step": 49
},
{
"epoch": 0.07347538574577517,
"grad_norm": 2.1596366801336244,
"learning_rate": 9.952293990072557e-07,
"loss": 1.1744,
"step": 50
},
{
"epoch": 0.07494489346069066,
"grad_norm": 2.1930183892423547,
"learning_rate": 9.948952901168874e-07,
"loss": 1.1859,
"step": 51
},
{
"epoch": 0.07641440117560618,
"grad_norm": 2.1938721237512153,
"learning_rate": 9.945499341043033e-07,
"loss": 1.1199,
"step": 52
},
{
"epoch": 0.07788390889052167,
"grad_norm": 1.9945902863761755,
"learning_rate": 9.94193338818156e-07,
"loss": 1.1325,
"step": 53
},
{
"epoch": 0.07935341660543718,
"grad_norm": 1.9432383836576688,
"learning_rate": 9.938255123625251e-07,
"loss": 1.1723,
"step": 54
},
{
"epoch": 0.08082292432035268,
"grad_norm": 1.9494589261620257,
"learning_rate": 9.934464630967328e-07,
"loss": 1.1796,
"step": 55
},
{
"epoch": 0.08229243203526819,
"grad_norm": 2.1536664626140762,
"learning_rate": 9.930561996351533e-07,
"loss": 1.1687,
"step": 56
},
{
"epoch": 0.08376193975018369,
"grad_norm": 1.8384845414400017,
"learning_rate": 9.926547308470183e-07,
"loss": 1.0806,
"step": 57
},
{
"epoch": 0.0852314474650992,
"grad_norm": 2.0289382431580183,
"learning_rate": 9.922420658562144e-07,
"loss": 1.11,
"step": 58
},
{
"epoch": 0.0867009551800147,
"grad_norm": 2.048447539695954,
"learning_rate": 9.91818214041076e-07,
"loss": 1.1103,
"step": 59
},
{
"epoch": 0.0881704628949302,
"grad_norm": 2.012578737651733,
"learning_rate": 9.913831850341725e-07,
"loss": 1.1158,
"step": 60
},
{
"epoch": 0.0896399706098457,
"grad_norm": 1.896428918202559,
"learning_rate": 9.90936988722089e-07,
"loss": 1.1864,
"step": 61
},
{
"epoch": 0.09110947832476121,
"grad_norm": 1.9545200740474544,
"learning_rate": 9.904796352452019e-07,
"loss": 1.0444,
"step": 62
},
{
"epoch": 0.09257898603967671,
"grad_norm": 2.0148450645381266,
"learning_rate": 9.900111349974478e-07,
"loss": 1.0916,
"step": 63
},
{
"epoch": 0.0940484937545922,
"grad_norm": 1.929357772553932,
"learning_rate": 9.895314986260886e-07,
"loss": 1.0798,
"step": 64
},
{
"epoch": 0.09551800146950772,
"grad_norm": 1.7614353081727012,
"learning_rate": 9.890407370314677e-07,
"loss": 1.0602,
"step": 65
},
{
"epoch": 0.09698750918442321,
"grad_norm": 1.812427621605287,
"learning_rate": 9.885388613667644e-07,
"loss": 1.1419,
"step": 66
},
{
"epoch": 0.09845701689933872,
"grad_norm": 1.955547061142271,
"learning_rate": 9.880258830377386e-07,
"loss": 1.1659,
"step": 67
},
{
"epoch": 0.09992652461425422,
"grad_norm": 1.884597834486459,
"learning_rate": 9.875018137024721e-07,
"loss": 1.0962,
"step": 68
},
{
"epoch": 0.10139603232916973,
"grad_norm": 1.9999685034920418,
"learning_rate": 9.869666652711049e-07,
"loss": 1.2142,
"step": 69
},
{
"epoch": 0.10286554004408523,
"grad_norm": 1.8727812783053652,
"learning_rate": 9.864204499055622e-07,
"loss": 1.1424,
"step": 70
},
{
"epoch": 0.10433504775900074,
"grad_norm": 1.9270273635952284,
"learning_rate": 9.858631800192804e-07,
"loss": 1.0969,
"step": 71
},
{
"epoch": 0.10580455547391623,
"grad_norm": 2.0055713055636355,
"learning_rate": 9.852948682769234e-07,
"loss": 1.1969,
"step": 72
},
{
"epoch": 0.10727406318883174,
"grad_norm": 1.8861847545246249,
"learning_rate": 9.84715527594095e-07,
"loss": 1.0814,
"step": 73
},
{
"epoch": 0.10874357090374724,
"grad_norm": 1.8457360601707475,
"learning_rate": 9.841251711370457e-07,
"loss": 1.1158,
"step": 74
},
{
"epoch": 0.11021307861866275,
"grad_norm": 1.9137633871950495,
"learning_rate": 9.83523812322374e-07,
"loss": 1.175,
"step": 75
},
{
"epoch": 0.11168258633357825,
"grad_norm": 1.7751460296642658,
"learning_rate": 9.829114648167206e-07,
"loss": 1.1066,
"step": 76
},
{
"epoch": 0.11315209404849376,
"grad_norm": 1.8728263698426233,
"learning_rate": 9.822881425364578e-07,
"loss": 1.0487,
"step": 77
},
{
"epoch": 0.11462160176340926,
"grad_norm": 1.720777832083087,
"learning_rate": 9.81653859647374e-07,
"loss": 1.1196,
"step": 78
},
{
"epoch": 0.11609110947832477,
"grad_norm": 1.87348358379571,
"learning_rate": 9.810086305643511e-07,
"loss": 1.0961,
"step": 79
},
{
"epoch": 0.11756061719324026,
"grad_norm": 1.7818512548748757,
"learning_rate": 9.803524699510372e-07,
"loss": 1.0888,
"step": 80
},
{
"epoch": 0.11903012490815577,
"grad_norm": 1.9374157076694885,
"learning_rate": 9.79685392719513e-07,
"loss": 1.1268,
"step": 81
},
{
"epoch": 0.12049963262307127,
"grad_norm": 2.026201319532345,
"learning_rate": 9.790074140299535e-07,
"loss": 1.1795,
"step": 82
},
{
"epoch": 0.12196914033798678,
"grad_norm": 1.7669713418401083,
"learning_rate": 9.783185492902831e-07,
"loss": 1.1169,
"step": 83
},
{
"epoch": 0.12343864805290228,
"grad_norm": 1.8294253428890799,
"learning_rate": 9.776188141558253e-07,
"loss": 1.0919,
"step": 84
},
{
"epoch": 0.12490815576781777,
"grad_norm": 1.8274541174825418,
"learning_rate": 9.769082245289472e-07,
"loss": 1.1123,
"step": 85
},
{
"epoch": 0.12637766348273327,
"grad_norm": 1.7514454813059583,
"learning_rate": 9.76186796558698e-07,
"loss": 1.1126,
"step": 86
},
{
"epoch": 0.1278471711976488,
"grad_norm": 2.0848609899286177,
"learning_rate": 9.754545466404423e-07,
"loss": 1.1261,
"step": 87
},
{
"epoch": 0.1293166789125643,
"grad_norm": 1.8956503679400867,
"learning_rate": 9.747114914154862e-07,
"loss": 1.13,
"step": 88
},
{
"epoch": 0.1307861866274798,
"grad_norm": 1.8870487896185353,
"learning_rate": 9.73957647770701e-07,
"loss": 1.1469,
"step": 89
},
{
"epoch": 0.13225569434239529,
"grad_norm": 1.817308542114037,
"learning_rate": 9.731930328381384e-07,
"loss": 1.1055,
"step": 90
},
{
"epoch": 0.1337252020573108,
"grad_norm": 1.9629911385480598,
"learning_rate": 9.72417663994641e-07,
"loss": 1.0912,
"step": 91
},
{
"epoch": 0.1351947097722263,
"grad_norm": 2.050021078478476,
"learning_rate": 9.716315588614472e-07,
"loss": 1.042,
"step": 92
},
{
"epoch": 0.1366642174871418,
"grad_norm": 1.9655965919411047,
"learning_rate": 9.708347353037924e-07,
"loss": 1.0731,
"step": 93
},
{
"epoch": 0.1381337252020573,
"grad_norm": 1.848057789367187,
"learning_rate": 9.700272114305008e-07,
"loss": 1.0957,
"step": 94
},
{
"epoch": 0.13960323291697282,
"grad_norm": 1.9055742505733597,
"learning_rate": 9.69209005593575e-07,
"loss": 1.1393,
"step": 95
},
{
"epoch": 0.14107274063188832,
"grad_norm": 1.885959701690901,
"learning_rate": 9.68380136387779e-07,
"loss": 1.1503,
"step": 96
},
{
"epoch": 0.14254224834680382,
"grad_norm": 1.9216191651411287,
"learning_rate": 9.67540622650215e-07,
"loss": 1.0944,
"step": 97
},
{
"epoch": 0.14401175606171931,
"grad_norm": 1.9221872996269755,
"learning_rate": 9.66690483459896e-07,
"loss": 1.125,
"step": 98
},
{
"epoch": 0.14548126377663484,
"grad_norm": 1.7695227554963031,
"learning_rate": 9.658297381373117e-07,
"loss": 1.134,
"step": 99
},
{
"epoch": 0.14695077149155034,
"grad_norm": 1.940990407598247,
"learning_rate": 9.649584062439897e-07,
"loss": 1.1638,
"step": 100
},
{
"epoch": 0.14842027920646583,
"grad_norm": 1.8521569502420978,
"learning_rate": 9.640765075820508e-07,
"loss": 1.1594,
"step": 101
},
{
"epoch": 0.14988978692138133,
"grad_norm": 1.9347402212683402,
"learning_rate": 9.631840621937585e-07,
"loss": 1.1594,
"step": 102
},
{
"epoch": 0.15135929463629685,
"grad_norm": 1.9130373667873128,
"learning_rate": 9.622810903610653e-07,
"loss": 1.1211,
"step": 103
},
{
"epoch": 0.15282880235121235,
"grad_norm": 1.813394824124808,
"learning_rate": 9.613676126051488e-07,
"loss": 1.1007,
"step": 104
},
{
"epoch": 0.15429831006612785,
"grad_norm": 1.9095237417724393,
"learning_rate": 9.604436496859482e-07,
"loss": 1.1796,
"step": 105
},
{
"epoch": 0.15576781778104334,
"grad_norm": 1.730720179443255,
"learning_rate": 9.595092226016912e-07,
"loss": 1.0689,
"step": 106
},
{
"epoch": 0.15723732549595884,
"grad_norm": 1.8826322939474722,
"learning_rate": 9.585643525884163e-07,
"loss": 1.112,
"step": 107
},
{
"epoch": 0.15870683321087437,
"grad_norm": 1.792447444970345,
"learning_rate": 9.576090611194915e-07,
"loss": 1.0629,
"step": 108
},
{
"epoch": 0.16017634092578986,
"grad_norm": 1.851908198387569,
"learning_rate": 9.566433699051248e-07,
"loss": 1.2135,
"step": 109
},
{
"epoch": 0.16164584864070536,
"grad_norm": 1.7553040384102077,
"learning_rate": 9.556673008918725e-07,
"loss": 1.0854,
"step": 110
},
{
"epoch": 0.16311535635562086,
"grad_norm": 1.910506976862337,
"learning_rate": 9.546808762621385e-07,
"loss": 1.0775,
"step": 111
},
{
"epoch": 0.16458486407053638,
"grad_norm": 1.7403892411303623,
"learning_rate": 9.536841184336725e-07,
"loss": 1.102,
"step": 112
},
{
"epoch": 0.16605437178545188,
"grad_norm": 1.925268524053033,
"learning_rate": 9.526770500590576e-07,
"loss": 1.0036,
"step": 113
},
{
"epoch": 0.16752387950036737,
"grad_norm": 1.7030002409038387,
"learning_rate": 9.516596940251986e-07,
"loss": 1.0702,
"step": 114
},
{
"epoch": 0.16899338721528287,
"grad_norm": 1.7840043977843556,
"learning_rate": 9.506320734527997e-07,
"loss": 1.0938,
"step": 115
},
{
"epoch": 0.1704628949301984,
"grad_norm": 1.8395314071839606,
"learning_rate": 9.495942116958395e-07,
"loss": 1.168,
"step": 116
},
{
"epoch": 0.1719324026451139,
"grad_norm": 1.7231121731862031,
"learning_rate": 9.485461323410411e-07,
"loss": 1.1674,
"step": 117
},
{
"epoch": 0.1734019103600294,
"grad_norm": 1.7743694765332263,
"learning_rate": 9.474878592073352e-07,
"loss": 1.1154,
"step": 118
},
{
"epoch": 0.17487141807494488,
"grad_norm": 1.889101434336393,
"learning_rate": 9.464194163453188e-07,
"loss": 1.0967,
"step": 119
},
{
"epoch": 0.1763409257898604,
"grad_norm": 1.791474621378483,
"learning_rate": 9.45340828036709e-07,
"loss": 1.0532,
"step": 120
},
{
"epoch": 0.1778104335047759,
"grad_norm": 1.7298856198753365,
"learning_rate": 9.442521187937911e-07,
"loss": 1.1783,
"step": 121
},
{
"epoch": 0.1792799412196914,
"grad_norm": 1.794443334367641,
"learning_rate": 9.431533133588616e-07,
"loss": 1.0877,
"step": 122
},
{
"epoch": 0.1807494489346069,
"grad_norm": 1.8423954972585201,
"learning_rate": 9.420444367036653e-07,
"loss": 1.0762,
"step": 123
},
{
"epoch": 0.18221895664952242,
"grad_norm": 1.7929621526317383,
"learning_rate": 9.409255140288288e-07,
"loss": 1.0728,
"step": 124
},
{
"epoch": 0.18368846436443792,
"grad_norm": 1.90760615166962,
"learning_rate": 9.397965707632866e-07,
"loss": 1.0913,
"step": 125
},
{
"epoch": 0.18515797207935342,
"grad_norm": 1.943714257581344,
"learning_rate": 9.386576325637043e-07,
"loss": 1.1129,
"step": 126
},
{
"epoch": 0.1866274797942689,
"grad_norm": 1.8505495879818339,
"learning_rate": 9.375087253138951e-07,
"loss": 1.1095,
"step": 127
},
{
"epoch": 0.1880969875091844,
"grad_norm": 1.8351144551335388,
"learning_rate": 9.363498751242307e-07,
"loss": 1.0664,
"step": 128
},
{
"epoch": 0.18956649522409993,
"grad_norm": 1.8698266517180562,
"learning_rate": 9.351811083310497e-07,
"loss": 1.0695,
"step": 129
},
{
"epoch": 0.19103600293901543,
"grad_norm": 1.8467428381107462,
"learning_rate": 9.340024514960574e-07,
"loss": 1.1099,
"step": 130
},
{
"epoch": 0.19250551065393093,
"grad_norm": 1.8383445717272457,
"learning_rate": 9.328139314057233e-07,
"loss": 1.0571,
"step": 131
},
{
"epoch": 0.19397501836884642,
"grad_norm": 1.7822708434524877,
"learning_rate": 9.316155750706713e-07,
"loss": 1.0643,
"step": 132
},
{
"epoch": 0.19544452608376195,
"grad_norm": 1.775080931488546,
"learning_rate": 9.304074097250671e-07,
"loss": 1.0923,
"step": 133
},
{
"epoch": 0.19691403379867745,
"grad_norm": 1.7534496873865253,
"learning_rate": 9.291894628259979e-07,
"loss": 1.0934,
"step": 134
},
{
"epoch": 0.19838354151359294,
"grad_norm": 1.9045369286252938,
"learning_rate": 9.279617620528496e-07,
"loss": 1.1063,
"step": 135
},
{
"epoch": 0.19985304922850844,
"grad_norm": 1.9789176992872752,
"learning_rate": 9.26724335306677e-07,
"loss": 1.1022,
"step": 136
},
{
"epoch": 0.20132255694342396,
"grad_norm": 1.8458671473525012,
"learning_rate": 9.254772107095701e-07,
"loss": 1.0984,
"step": 137
},
{
"epoch": 0.20279206465833946,
"grad_norm": 1.6935951448294548,
"learning_rate": 9.242204166040148e-07,
"loss": 1.0379,
"step": 138
},
{
"epoch": 0.20426157237325496,
"grad_norm": 1.9390186462720589,
"learning_rate": 9.229539815522485e-07,
"loss": 1.1088,
"step": 139
},
{
"epoch": 0.20573108008817045,
"grad_norm": 1.8215087232144447,
"learning_rate": 9.216779343356119e-07,
"loss": 1.0305,
"step": 140
},
{
"epoch": 0.20720058780308598,
"grad_norm": 1.8391377902744837,
"learning_rate": 9.203923039538939e-07,
"loss": 1.03,
"step": 141
},
{
"epoch": 0.20867009551800147,
"grad_norm": 1.8222570147638864,
"learning_rate": 9.190971196246731e-07,
"loss": 1.1201,
"step": 142
},
{
"epoch": 0.21013960323291697,
"grad_norm": 1.9074452445907568,
"learning_rate": 9.177924107826535e-07,
"loss": 1.0716,
"step": 143
},
{
"epoch": 0.21160911094783247,
"grad_norm": 1.8604141888521697,
"learning_rate": 9.164782070789961e-07,
"loss": 1.1475,
"step": 144
},
{
"epoch": 0.213078618662748,
"grad_norm": 1.8195345943864918,
"learning_rate": 9.151545383806441e-07,
"loss": 1.0211,
"step": 145
},
{
"epoch": 0.2145481263776635,
"grad_norm": 1.7827672387312539,
"learning_rate": 9.138214347696453e-07,
"loss": 1.163,
"step": 146
},
{
"epoch": 0.216017634092579,
"grad_norm": 1.861693336411916,
"learning_rate": 9.124789265424674e-07,
"loss": 1.1053,
"step": 147
},
{
"epoch": 0.21748714180749448,
"grad_norm": 1.9925460288336267,
"learning_rate": 9.1112704420931e-07,
"loss": 1.0823,
"step": 148
},
{
"epoch": 0.21895664952240998,
"grad_norm": 1.8673762233131246,
"learning_rate": 9.097658184934114e-07,
"loss": 1.0893,
"step": 149
},
{
"epoch": 0.2204261572373255,
"grad_norm": 1.8135361820690228,
"learning_rate": 9.083952803303496e-07,
"loss": 1.1804,
"step": 150
},
{
"epoch": 0.221895664952241,
"grad_norm": 1.7943832305382856,
"learning_rate": 9.070154608673402e-07,
"loss": 1.1108,
"step": 151
},
{
"epoch": 0.2233651726671565,
"grad_norm": 1.9887027909424768,
"learning_rate": 9.056263914625277e-07,
"loss": 1.1501,
"step": 152
},
{
"epoch": 0.224834680382072,
"grad_norm": 1.9146935645186336,
"learning_rate": 9.042281036842739e-07,
"loss": 1.1279,
"step": 153
},
{
"epoch": 0.22630418809698752,
"grad_norm": 1.7369719877012806,
"learning_rate": 9.028206293104391e-07,
"loss": 1.0497,
"step": 154
},
{
"epoch": 0.22777369581190302,
"grad_norm": 1.8786686675134308,
"learning_rate": 9.014040003276611e-07,
"loss": 1.1104,
"step": 155
},
{
"epoch": 0.2292432035268185,
"grad_norm": 1.9105299787467294,
"learning_rate": 8.999782489306271e-07,
"loss": 1.0936,
"step": 156
},
{
"epoch": 0.230712711241734,
"grad_norm": 1.8120395385442194,
"learning_rate": 8.985434075213439e-07,
"loss": 1.083,
"step": 157
},
{
"epoch": 0.23218221895664953,
"grad_norm": 1.7363191743038984,
"learning_rate": 8.970995087083992e-07,
"loss": 0.9772,
"step": 158
},
{
"epoch": 0.23365172667156503,
"grad_norm": 1.7820020797816543,
"learning_rate": 8.956465853062222e-07,
"loss": 1.0146,
"step": 159
},
{
"epoch": 0.23512123438648053,
"grad_norm": 1.7586524905646472,
"learning_rate": 8.941846703343372e-07,
"loss": 1.1279,
"step": 160
},
{
"epoch": 0.23659074210139602,
"grad_norm": 1.7894968447971773,
"learning_rate": 8.927137970166135e-07,
"loss": 1.0527,
"step": 161
},
{
"epoch": 0.23806024981631155,
"grad_norm": 1.7848783538320003,
"learning_rate": 8.912339987805099e-07,
"loss": 1.0751,
"step": 162
},
{
"epoch": 0.23952975753122704,
"grad_norm": 1.8693803940347191,
"learning_rate": 8.897453092563153e-07,
"loss": 1.0697,
"step": 163
},
{
"epoch": 0.24099926524614254,
"grad_norm": 1.8074883029338882,
"learning_rate": 8.882477622763846e-07,
"loss": 1.0944,
"step": 164
},
{
"epoch": 0.24246877296105804,
"grad_norm": 1.797204547554038,
"learning_rate": 8.867413918743693e-07,
"loss": 1.0973,
"step": 165
},
{
"epoch": 0.24393828067597356,
"grad_norm": 1.8392322504850838,
"learning_rate": 8.852262322844444e-07,
"loss": 1.0919,
"step": 166
},
{
"epoch": 0.24540778839088906,
"grad_norm": 1.7623077046993691,
"learning_rate": 8.837023179405308e-07,
"loss": 1.1094,
"step": 167
},
{
"epoch": 0.24687729610580456,
"grad_norm": 1.7742943941570524,
"learning_rate": 8.821696834755117e-07,
"loss": 1.0372,
"step": 168
},
{
"epoch": 0.24834680382072005,
"grad_norm": 1.787531814654772,
"learning_rate": 8.806283637204462e-07,
"loss": 1.1277,
"step": 169
},
{
"epoch": 0.24981631153563555,
"grad_norm": 1.8409448609213654,
"learning_rate": 8.790783937037776e-07,
"loss": 1.0667,
"step": 170
},
{
"epoch": 0.25128581925055105,
"grad_norm": 1.8241238807877374,
"learning_rate": 8.775198086505375e-07,
"loss": 1.0712,
"step": 171
},
{
"epoch": 0.25275532696546654,
"grad_norm": 1.9461903617468934,
"learning_rate": 8.759526439815455e-07,
"loss": 0.986,
"step": 172
},
{
"epoch": 0.2542248346803821,
"grad_norm": 1.8059963729135269,
"learning_rate": 8.743769353126029e-07,
"loss": 1.1027,
"step": 173
},
{
"epoch": 0.2556943423952976,
"grad_norm": 1.7678488694251377,
"learning_rate": 8.727927184536849e-07,
"loss": 1.0659,
"step": 174
},
{
"epoch": 0.2571638501102131,
"grad_norm": 1.7749804271044742,
"learning_rate": 8.712000294081259e-07,
"loss": 1.022,
"step": 175
},
{
"epoch": 0.2586333578251286,
"grad_norm": 1.707242563839455,
"learning_rate": 8.695989043718015e-07,
"loss": 1.0917,
"step": 176
},
{
"epoch": 0.2601028655400441,
"grad_norm": 1.8427148257842918,
"learning_rate": 8.679893797323058e-07,
"loss": 1.0724,
"step": 177
},
{
"epoch": 0.2615723732549596,
"grad_norm": 1.8363003275755554,
"learning_rate": 8.663714920681245e-07,
"loss": 1.1111,
"step": 178
},
{
"epoch": 0.2630418809698751,
"grad_norm": 1.8365044852133108,
"learning_rate": 8.64745278147804e-07,
"loss": 1.0855,
"step": 179
},
{
"epoch": 0.26451138868479057,
"grad_norm": 1.8731659851674847,
"learning_rate": 8.631107749291148e-07,
"loss": 1.0675,
"step": 180
},
{
"epoch": 0.2659808963997061,
"grad_norm": 1.9141757031458602,
"learning_rate": 8.614680195582127e-07,
"loss": 1.1196,
"step": 181
},
{
"epoch": 0.2674504041146216,
"grad_norm": 1.8358916498700413,
"learning_rate": 8.598170493687939e-07,
"loss": 1.0301,
"step": 182
},
{
"epoch": 0.2689199118295371,
"grad_norm": 1.7578208636926487,
"learning_rate": 8.581579018812468e-07,
"loss": 1.089,
"step": 183
},
{
"epoch": 0.2703894195444526,
"grad_norm": 1.8746198379934478,
"learning_rate": 8.564906148017992e-07,
"loss": 1.1131,
"step": 184
},
{
"epoch": 0.2718589272593681,
"grad_norm": 1.7730333983609725,
"learning_rate": 8.548152260216613e-07,
"loss": 1.034,
"step": 185
},
{
"epoch": 0.2733284349742836,
"grad_norm": 1.7445016041336918,
"learning_rate": 8.531317736161652e-07,
"loss": 1.002,
"step": 186
},
{
"epoch": 0.2747979426891991,
"grad_norm": 1.716730493002548,
"learning_rate": 8.514402958438987e-07,
"loss": 1.0445,
"step": 187
},
{
"epoch": 0.2762674504041146,
"grad_norm": 1.753035784826817,
"learning_rate": 8.497408311458362e-07,
"loss": 1.0203,
"step": 188
},
{
"epoch": 0.2777369581190301,
"grad_norm": 1.7624662841844527,
"learning_rate": 8.480334181444652e-07,
"loss": 1.0724,
"step": 189
},
{
"epoch": 0.27920646583394565,
"grad_norm": 1.8184473786367534,
"learning_rate": 8.463180956429085e-07,
"loss": 1.1048,
"step": 190
},
{
"epoch": 0.28067597354886115,
"grad_norm": 1.7695724382850178,
"learning_rate": 8.445949026240424e-07,
"loss": 1.1535,
"step": 191
},
{
"epoch": 0.28214548126377664,
"grad_norm": 1.7396814430482903,
"learning_rate": 8.428638782496105e-07,
"loss": 1.0529,
"step": 192
},
{
"epoch": 0.28361498897869214,
"grad_norm": 1.7614538875912402,
"learning_rate": 8.411250618593337e-07,
"loss": 1.1533,
"step": 193
},
{
"epoch": 0.28508449669360764,
"grad_norm": 1.8555305634941939,
"learning_rate": 8.393784929700169e-07,
"loss": 1.0832,
"step": 194
},
{
"epoch": 0.28655400440852313,
"grad_norm": 1.7138697869173838,
"learning_rate": 8.376242112746499e-07,
"loss": 0.9737,
"step": 195
},
{
"epoch": 0.28802351212343863,
"grad_norm": 1.8888153946382773,
"learning_rate": 8.358622566415057e-07,
"loss": 1.0011,
"step": 196
},
{
"epoch": 0.2894930198383541,
"grad_norm": 1.7635141256389373,
"learning_rate": 8.340926691132348e-07,
"loss": 1.0477,
"step": 197
},
{
"epoch": 0.2909625275532697,
"grad_norm": 1.736090710284999,
"learning_rate": 8.323154889059549e-07,
"loss": 1.0677,
"step": 198
},
{
"epoch": 0.2924320352681852,
"grad_norm": 1.9655528448046806,
"learning_rate": 8.305307564083368e-07,
"loss": 1.1974,
"step": 199
},
{
"epoch": 0.29390154298310067,
"grad_norm": 1.7937853629471947,
"learning_rate": 8.287385121806869e-07,
"loss": 1.0823,
"step": 200
},
{
"epoch": 0.29537105069801617,
"grad_norm": 1.9050615593905893,
"learning_rate": 8.26938796954025e-07,
"loss": 1.1672,
"step": 201
},
{
"epoch": 0.29684055841293167,
"grad_norm": 1.7624130253100436,
"learning_rate": 8.251316516291586e-07,
"loss": 1.0203,
"step": 202
},
{
"epoch": 0.29831006612784716,
"grad_norm": 1.816075547247136,
"learning_rate": 8.233171172757539e-07,
"loss": 1.0583,
"step": 203
},
{
"epoch": 0.29977957384276266,
"grad_norm": 1.750336923146612,
"learning_rate": 8.214952351314022e-07,
"loss": 1.0375,
"step": 204
},
{
"epoch": 0.30124908155767816,
"grad_norm": 1.766779650906924,
"learning_rate": 8.196660466006823e-07,
"loss": 1.0997,
"step": 205
},
{
"epoch": 0.3027185892725937,
"grad_norm": 1.8860580284916557,
"learning_rate": 8.178295932542205e-07,
"loss": 1.0788,
"step": 206
},
{
"epoch": 0.3041880969875092,
"grad_norm": 1.7781746925928104,
"learning_rate": 8.159859168277444e-07,
"loss": 0.968,
"step": 207
},
{
"epoch": 0.3056576047024247,
"grad_norm": 1.8615566220877084,
"learning_rate": 8.141350592211358e-07,
"loss": 1.066,
"step": 208
},
{
"epoch": 0.3071271124173402,
"grad_norm": 1.7774696373511238,
"learning_rate": 8.122770624974778e-07,
"loss": 0.9954,
"step": 209
},
{
"epoch": 0.3085966201322557,
"grad_norm": 1.8457945487006882,
"learning_rate": 8.10411968882099e-07,
"loss": 1.0606,
"step": 210
},
{
"epoch": 0.3100661278471712,
"grad_norm": 1.8137334109126484,
"learning_rate": 8.085398207616138e-07,
"loss": 1.1464,
"step": 211
},
{
"epoch": 0.3115356355620867,
"grad_norm": 1.824787905787397,
"learning_rate": 8.06660660682959e-07,
"loss": 1.0186,
"step": 212
},
{
"epoch": 0.3130051432770022,
"grad_norm": 1.8040685491884534,
"learning_rate": 8.047745313524275e-07,
"loss": 1.0554,
"step": 213
},
{
"epoch": 0.3144746509919177,
"grad_norm": 1.7370025880384308,
"learning_rate": 8.028814756346967e-07,
"loss": 1.0432,
"step": 214
},
{
"epoch": 0.31594415870683323,
"grad_norm": 1.7648192357449863,
"learning_rate": 8.009815365518554e-07,
"loss": 1.1266,
"step": 215
},
{
"epoch": 0.31741366642174873,
"grad_norm": 1.827301117863129,
"learning_rate": 7.990747572824253e-07,
"loss": 1.0342,
"step": 216
},
{
"epoch": 0.3188831741366642,
"grad_norm": 2.006041793599483,
"learning_rate": 7.971611811603803e-07,
"loss": 1.083,
"step": 217
},
{
"epoch": 0.3203526818515797,
"grad_norm": 1.8903539087686725,
"learning_rate": 7.952408516741607e-07,
"loss": 1.059,
"step": 218
},
{
"epoch": 0.3218221895664952,
"grad_norm": 1.8842212775756257,
"learning_rate": 7.933138124656864e-07,
"loss": 1.0677,
"step": 219
},
{
"epoch": 0.3232916972814107,
"grad_norm": 1.975112068311392,
"learning_rate": 7.913801073293638e-07,
"loss": 1.1073,
"step": 220
},
{
"epoch": 0.3247612049963262,
"grad_norm": 1.681430647589897,
"learning_rate": 7.894397802110908e-07,
"loss": 1.0563,
"step": 221
},
{
"epoch": 0.3262307127112417,
"grad_norm": 1.8380971719219996,
"learning_rate": 7.87492875207259e-07,
"loss": 1.1822,
"step": 222
},
{
"epoch": 0.32770022042615726,
"grad_norm": 1.7074365534791345,
"learning_rate": 7.855394365637495e-07,
"loss": 1.0594,
"step": 223
},
{
"epoch": 0.32916972814107276,
"grad_norm": 1.821964813971357,
"learning_rate": 7.835795086749299e-07,
"loss": 1.113,
"step": 224
},
{
"epoch": 0.33063923585598826,
"grad_norm": 1.8642253124188575,
"learning_rate": 7.816131360826434e-07,
"loss": 1.0596,
"step": 225
},
{
"epoch": 0.33210874357090375,
"grad_norm": 1.7175969911981157,
"learning_rate": 7.796403634751973e-07,
"loss": 1.053,
"step": 226
},
{
"epoch": 0.33357825128581925,
"grad_norm": 1.777915859356033,
"learning_rate": 7.776612356863477e-07,
"loss": 1.0065,
"step": 227
},
{
"epoch": 0.33504775900073475,
"grad_norm": 1.8085189246724207,
"learning_rate": 7.756757976942798e-07,
"loss": 1.0394,
"step": 228
},
{
"epoch": 0.33651726671565024,
"grad_norm": 1.8756745576407878,
"learning_rate": 7.736840946205865e-07,
"loss": 1.1273,
"step": 229
},
{
"epoch": 0.33798677443056574,
"grad_norm": 1.8327083341200932,
"learning_rate": 7.716861717292424e-07,
"loss": 1.0538,
"step": 230
},
{
"epoch": 0.33945628214548124,
"grad_norm": 1.8631431553314504,
"learning_rate": 7.696820744255756e-07,
"loss": 1.1507,
"step": 231
},
{
"epoch": 0.3409257898603968,
"grad_norm": 1.7465550716286358,
"learning_rate": 7.676718482552353e-07,
"loss": 1.0697,
"step": 232
},
{
"epoch": 0.3423952975753123,
"grad_norm": 1.7322371342891512,
"learning_rate": 7.65655538903157e-07,
"loss": 1.0302,
"step": 233
},
{
"epoch": 0.3438648052902278,
"grad_norm": 1.7814973341724534,
"learning_rate": 7.636331921925241e-07,
"loss": 1.0899,
"step": 234
},
{
"epoch": 0.3453343130051433,
"grad_norm": 1.835128046923307,
"learning_rate": 7.61604854083727e-07,
"loss": 1.0511,
"step": 235
},
{
"epoch": 0.3468038207200588,
"grad_norm": 1.890591777633699,
"learning_rate": 7.595705706733178e-07,
"loss": 1.0503,
"step": 236
},
{
"epoch": 0.34827332843497427,
"grad_norm": 1.802438897355832,
"learning_rate": 7.575303881929632e-07,
"loss": 1.1125,
"step": 237
},
{
"epoch": 0.34974283614988977,
"grad_norm": 1.7658119404997528,
"learning_rate": 7.55484353008394e-07,
"loss": 1.0737,
"step": 238
},
{
"epoch": 0.35121234386480527,
"grad_norm": 1.8011836883001782,
"learning_rate": 7.534325116183508e-07,
"loss": 1.0891,
"step": 239
},
{
"epoch": 0.3526818515797208,
"grad_norm": 1.7936027919741155,
"learning_rate": 7.513749106535278e-07,
"loss": 1.0433,
"step": 240
},
{
"epoch": 0.3541513592946363,
"grad_norm": 1.8017282268668782,
"learning_rate": 7.493115968755125e-07,
"loss": 1.047,
"step": 241
},
{
"epoch": 0.3556208670095518,
"grad_norm": 1.9342694434293286,
"learning_rate": 7.472426171757238e-07,
"loss": 1.098,
"step": 242
},
{
"epoch": 0.3570903747244673,
"grad_norm": 1.8077529691670307,
"learning_rate": 7.451680185743454e-07,
"loss": 1.0277,
"step": 243
},
{
"epoch": 0.3585598824393828,
"grad_norm": 1.8830252116300301,
"learning_rate": 7.430878482192579e-07,
"loss": 0.9846,
"step": 244
},
{
"epoch": 0.3600293901542983,
"grad_norm": 1.8998227561518721,
"learning_rate": 7.41002153384967e-07,
"loss": 1.0897,
"step": 245
},
{
"epoch": 0.3614988978692138,
"grad_norm": 1.8229424521327513,
"learning_rate": 7.389109814715292e-07,
"loss": 1.0718,
"step": 246
},
{
"epoch": 0.3629684055841293,
"grad_norm": 1.871640131350982,
"learning_rate": 7.368143800034745e-07,
"loss": 1.1105,
"step": 247
},
{
"epoch": 0.36443791329904485,
"grad_norm": 1.728622836313891,
"learning_rate": 7.347123966287265e-07,
"loss": 1.0658,
"step": 248
},
{
"epoch": 0.36590742101396034,
"grad_norm": 1.9339436433410142,
"learning_rate": 7.326050791175196e-07,
"loss": 1.1393,
"step": 249
},
{
"epoch": 0.36737692872887584,
"grad_norm": 1.716023610009682,
"learning_rate": 7.304924753613127e-07,
"loss": 1.0316,
"step": 250
},
{
"epoch": 0.36737692872887584,
"eval_ical_mcts_chains_sft_val_MORECHAINS_loss": 2.707226037979126,
"eval_ical_mcts_chains_sft_val_MORECHAINS_runtime": 17.8654,
"eval_ical_mcts_chains_sft_val_MORECHAINS_samples_per_second": 5.206,
"eval_ical_mcts_chains_sft_val_MORECHAINS_steps_per_second": 0.672,
"step": 250
},
{
"epoch": 0.36884643644379134,
"grad_norm": 1.7391519225856011,
"learning_rate": 7.283746333717014e-07,
"loss": 1.0377,
"step": 251
},
{
"epoch": 0.37031594415870683,
"grad_norm": 1.818410985977693,
"learning_rate": 7.262516012793276e-07,
"loss": 1.0345,
"step": 252
},
{
"epoch": 0.37178545187362233,
"grad_norm": 1.8812075301481235,
"learning_rate": 7.241234273327838e-07,
"loss": 1.0111,
"step": 253
},
{
"epoch": 0.3732549595885378,
"grad_norm": 1.8256426567686606,
"learning_rate": 7.219901598975185e-07,
"loss": 1.0242,
"step": 254
},
{
"epoch": 0.3747244673034533,
"grad_norm": 1.8032448771328675,
"learning_rate": 7.198518474547354e-07,
"loss": 0.9857,
"step": 255
},
{
"epoch": 0.3761939750183688,
"grad_norm": 1.8445748755872284,
"learning_rate": 7.17708538600293e-07,
"loss": 1.0593,
"step": 256
},
{
"epoch": 0.3776634827332844,
"grad_norm": 1.8408999808307878,
"learning_rate": 7.155602820435992e-07,
"loss": 1.0839,
"step": 257
},
{
"epoch": 0.37913299044819987,
"grad_norm": 1.8091016805737823,
"learning_rate": 7.134071266065051e-07,
"loss": 1.058,
"step": 258
},
{
"epoch": 0.38060249816311537,
"grad_norm": 1.8529955273165009,
"learning_rate": 7.112491212221946e-07,
"loss": 1.0021,
"step": 259
},
{
"epoch": 0.38207200587803086,
"grad_norm": 1.8092464332644904,
"learning_rate": 7.09086314934073e-07,
"loss": 1.0064,
"step": 260
},
{
"epoch": 0.38354151359294636,
"grad_norm": 1.795984690847929,
"learning_rate": 7.069187568946524e-07,
"loss": 1.0654,
"step": 261
},
{
"epoch": 0.38501102130786186,
"grad_norm": 1.7737795304586095,
"learning_rate": 7.047464963644342e-07,
"loss": 1.0151,
"step": 262
},
{
"epoch": 0.38648052902277735,
"grad_norm": 1.8528676449577293,
"learning_rate": 7.025695827107901e-07,
"loss": 1.0804,
"step": 263
},
{
"epoch": 0.38795003673769285,
"grad_norm": 1.8476585972521806,
"learning_rate": 7.003880654068395e-07,
"loss": 1.091,
"step": 264
},
{
"epoch": 0.3894195444526084,
"grad_norm": 1.9052486545665286,
"learning_rate": 6.98201994030326e-07,
"loss": 1.0933,
"step": 265
},
{
"epoch": 0.3908890521675239,
"grad_norm": 1.778971704628668,
"learning_rate": 6.960114182624902e-07,
"loss": 1.1075,
"step": 266
},
{
"epoch": 0.3923585598824394,
"grad_norm": 1.7435161249168551,
"learning_rate": 6.938163878869405e-07,
"loss": 1.0052,
"step": 267
},
{
"epoch": 0.3938280675973549,
"grad_norm": 1.7315215450330044,
"learning_rate": 6.916169527885221e-07,
"loss": 1.0512,
"step": 268
},
{
"epoch": 0.3952975753122704,
"grad_norm": 1.7876056692528195,
"learning_rate": 6.894131629521829e-07,
"loss": 1.1442,
"step": 269
},
{
"epoch": 0.3967670830271859,
"grad_norm": 1.7710225516067628,
"learning_rate": 6.872050684618381e-07,
"loss": 1.081,
"step": 270
},
{
"epoch": 0.3982365907421014,
"grad_norm": 1.7329335705128692,
"learning_rate": 6.849927194992312e-07,
"loss": 0.9969,
"step": 271
},
{
"epoch": 0.3997060984570169,
"grad_norm": 1.8592291193285087,
"learning_rate": 6.827761663427943e-07,
"loss": 1.1048,
"step": 272
},
{
"epoch": 0.4011756061719324,
"grad_norm": 1.795673055617955,
"learning_rate": 6.805554593665049e-07,
"loss": 1.1164,
"step": 273
},
{
"epoch": 0.4026451138868479,
"grad_norm": 1.7367854154136189,
"learning_rate": 6.783306490387414e-07,
"loss": 0.9914,
"step": 274
},
{
"epoch": 0.4041146216017634,
"grad_norm": 1.7004321271293306,
"learning_rate": 6.761017859211359e-07,
"loss": 1.0202,
"step": 275
},
{
"epoch": 0.4055841293166789,
"grad_norm": 1.821764236324883,
"learning_rate": 6.738689206674257e-07,
"loss": 0.9857,
"step": 276
},
{
"epoch": 0.4070536370315944,
"grad_norm": 1.7847774281240236,
"learning_rate": 6.716321040223014e-07,
"loss": 1.0136,
"step": 277
},
{
"epoch": 0.4085231447465099,
"grad_norm": 1.8328549796172011,
"learning_rate": 6.693913868202539e-07,
"loss": 1.0137,
"step": 278
},
{
"epoch": 0.4099926524614254,
"grad_norm": 1.6946865477492625,
"learning_rate": 6.671468199844192e-07,
"loss": 1.0017,
"step": 279
},
{
"epoch": 0.4114621601763409,
"grad_norm": 1.7971440056715982,
"learning_rate": 6.648984545254216e-07,
"loss": 1.0347,
"step": 280
},
{
"epoch": 0.4129316678912564,
"grad_norm": 1.7300255825420612,
"learning_rate": 6.626463415402131e-07,
"loss": 1.0327,
"step": 281
},
{
"epoch": 0.41440117560617196,
"grad_norm": 1.7175987687087009,
"learning_rate": 6.603905322109138e-07,
"loss": 1.0332,
"step": 282
},
{
"epoch": 0.41587068332108745,
"grad_norm": 1.8379257145846943,
"learning_rate": 6.581310778036474e-07,
"loss": 0.8794,
"step": 283
},
{
"epoch": 0.41734019103600295,
"grad_norm": 1.8820137872210625,
"learning_rate": 6.558680296673766e-07,
"loss": 1.0813,
"step": 284
},
{
"epoch": 0.41880969875091845,
"grad_norm": 1.7896323310359439,
"learning_rate": 6.536014392327365e-07,
"loss": 1.0797,
"step": 285
},
{
"epoch": 0.42027920646583394,
"grad_norm": 1.8676894344858546,
"learning_rate": 6.51331358010865e-07,
"loss": 1.0154,
"step": 286
},
{
"epoch": 0.42174871418074944,
"grad_norm": 1.9247091383586632,
"learning_rate": 6.490578375922328e-07,
"loss": 1.0886,
"step": 287
},
{
"epoch": 0.42321822189566494,
"grad_norm": 1.9871628498039662,
"learning_rate": 6.467809296454708e-07,
"loss": 1.0183,
"step": 288
},
{
"epoch": 0.42468772961058043,
"grad_norm": 1.80768668341203,
"learning_rate": 6.445006859161956e-07,
"loss": 1.1154,
"step": 289
},
{
"epoch": 0.426157237325496,
"grad_norm": 1.7983774844966378,
"learning_rate": 6.422171582258334e-07,
"loss": 1.0147,
"step": 290
},
{
"epoch": 0.4276267450404115,
"grad_norm": 1.7722882920293253,
"learning_rate": 6.399303984704432e-07,
"loss": 1.0592,
"step": 291
},
{
"epoch": 0.429096252755327,
"grad_norm": 1.8457372162292673,
"learning_rate": 6.376404586195364e-07,
"loss": 0.9973,
"step": 292
},
{
"epoch": 0.4305657604702425,
"grad_norm": 1.895053197799994,
"learning_rate": 6.353473907148961e-07,
"loss": 1.0714,
"step": 293
},
{
"epoch": 0.432035268185158,
"grad_norm": 1.7517450211480494,
"learning_rate": 6.330512468693944e-07,
"loss": 1.0012,
"step": 294
},
{
"epoch": 0.43350477590007347,
"grad_norm": 1.7629299026170842,
"learning_rate": 6.307520792658081e-07,
"loss": 1.0542,
"step": 295
},
{
"epoch": 0.43497428361498897,
"grad_norm": 1.8006758213208047,
"learning_rate": 6.284499401556328e-07,
"loss": 1.1121,
"step": 296
},
{
"epoch": 0.43644379132990446,
"grad_norm": 1.8081212325920115,
"learning_rate": 6.261448818578952e-07,
"loss": 1.0156,
"step": 297
},
{
"epoch": 0.43791329904481996,
"grad_norm": 1.7825996298491462,
"learning_rate": 6.238369567579642e-07,
"loss": 1.0645,
"step": 298
},
{
"epoch": 0.4393828067597355,
"grad_norm": 1.8083690724086425,
"learning_rate": 6.215262173063607e-07,
"loss": 1.0702,
"step": 299
},
{
"epoch": 0.440852314474651,
"grad_norm": 1.931050770675179,
"learning_rate": 6.192127160175649e-07,
"loss": 1.0715,
"step": 300
},
{
"epoch": 0.4423218221895665,
"grad_norm": 1.7810570021280987,
"learning_rate": 6.168965054688238e-07,
"loss": 1.0486,
"step": 301
},
{
"epoch": 0.443791329904482,
"grad_norm": 1.8207804972547699,
"learning_rate": 6.145776382989552e-07,
"loss": 1.0234,
"step": 302
},
{
"epoch": 0.4452608376193975,
"grad_norm": 1.9112996054609888,
"learning_rate": 6.122561672071521e-07,
"loss": 0.9732,
"step": 303
},
{
"epoch": 0.446730345334313,
"grad_norm": 1.8392870095393898,
"learning_rate": 6.099321449517851e-07,
"loss": 1.0694,
"step": 304
},
{
"epoch": 0.4481998530492285,
"grad_norm": 1.746344158928944,
"learning_rate": 6.076056243492035e-07,
"loss": 1.0584,
"step": 305
},
{
"epoch": 0.449669360764144,
"grad_norm": 1.8625991708058884,
"learning_rate": 6.052766582725339e-07,
"loss": 0.9656,
"step": 306
},
{
"epoch": 0.45113886847905954,
"grad_norm": 1.8082814780076306,
"learning_rate": 6.029452996504801e-07,
"loss": 1.0642,
"step": 307
},
{
"epoch": 0.45260837619397504,
"grad_norm": 1.7985254132681678,
"learning_rate": 6.006116014661191e-07,
"loss": 1.1043,
"step": 308
},
{
"epoch": 0.45407788390889053,
"grad_norm": 1.787084218039004,
"learning_rate": 5.982756167556978e-07,
"loss": 0.9985,
"step": 309
},
{
"epoch": 0.45554739162380603,
"grad_norm": 1.866387388466877,
"learning_rate": 5.959373986074269e-07,
"loss": 1.0453,
"step": 310
},
{
"epoch": 0.4570168993387215,
"grad_norm": 1.7953700540717752,
"learning_rate": 5.935970001602751e-07,
"loss": 0.994,
"step": 311
},
{
"epoch": 0.458486407053637,
"grad_norm": 1.6903793992507545,
"learning_rate": 5.912544746027612e-07,
"loss": 1.0204,
"step": 312
},
{
"epoch": 0.4599559147685525,
"grad_norm": 1.7894140258608537,
"learning_rate": 5.88909875171745e-07,
"loss": 1.0958,
"step": 313
},
{
"epoch": 0.461425422483468,
"grad_norm": 1.919168721100207,
"learning_rate": 5.865632551512175e-07,
"loss": 0.9804,
"step": 314
},
{
"epoch": 0.4628949301983835,
"grad_norm": 1.6965567076432635,
"learning_rate": 5.842146678710911e-07,
"loss": 1.0335,
"step": 315
},
{
"epoch": 0.46436443791329907,
"grad_norm": 1.8629208572600364,
"learning_rate": 5.818641667059856e-07,
"loss": 1.0513,
"step": 316
},
{
"epoch": 0.46583394562821456,
"grad_norm": 2.0773647110529296,
"learning_rate": 5.795118050740169e-07,
"loss": 1.0175,
"step": 317
},
{
"epoch": 0.46730345334313006,
"grad_norm": 1.8282782276172471,
"learning_rate": 5.771576364355819e-07,
"loss": 1.0706,
"step": 318
},
{
"epoch": 0.46877296105804556,
"grad_norm": 1.8215206601821508,
"learning_rate": 5.748017142921448e-07,
"loss": 1.1167,
"step": 319
},
{
"epoch": 0.47024246877296105,
"grad_norm": 1.7965481050358185,
"learning_rate": 5.724440921850195e-07,
"loss": 1.1283,
"step": 320
},
{
"epoch": 0.47171197648787655,
"grad_norm": 1.9457437301973206,
"learning_rate": 5.700848236941543e-07,
"loss": 1.0308,
"step": 321
},
{
"epoch": 0.47318148420279205,
"grad_norm": 1.834038700326329,
"learning_rate": 5.677239624369134e-07,
"loss": 0.9875,
"step": 322
},
{
"epoch": 0.47465099191770754,
"grad_norm": 1.8030898471332173,
"learning_rate": 5.653615620668589e-07,
"loss": 1.0106,
"step": 323
},
{
"epoch": 0.4761204996326231,
"grad_norm": 1.9350537183535073,
"learning_rate": 5.629976762725307e-07,
"loss": 0.9902,
"step": 324
},
{
"epoch": 0.4775900073475386,
"grad_norm": 1.8510351646518606,
"learning_rate": 5.606323587762275e-07,
"loss": 1.2716,
"step": 325
},
{
"epoch": 0.4790595150624541,
"grad_norm": 1.8453958565296336,
"learning_rate": 5.582656633327848e-07,
"loss": 1.0647,
"step": 326
},
{
"epoch": 0.4805290227773696,
"grad_norm": 1.8119764316370859,
"learning_rate": 5.558976437283535e-07,
"loss": 1.0782,
"step": 327
},
{
"epoch": 0.4819985304922851,
"grad_norm": 1.757616643542913,
"learning_rate": 5.535283537791785e-07,
"loss": 1.085,
"step": 328
},
{
"epoch": 0.4834680382072006,
"grad_norm": 1.8024345990236037,
"learning_rate": 5.511578473303742e-07,
"loss": 1.062,
"step": 329
},
{
"epoch": 0.4849375459221161,
"grad_norm": 1.79520109860375,
"learning_rate": 5.487861782547017e-07,
"loss": 1.0857,
"step": 330
},
{
"epoch": 0.4864070536370316,
"grad_norm": 1.7467294013133847,
"learning_rate": 5.464134004513442e-07,
"loss": 1.0852,
"step": 331
},
{
"epoch": 0.4878765613519471,
"grad_norm": 1.762768838854569,
"learning_rate": 5.440395678446825e-07,
"loss": 1.0764,
"step": 332
},
{
"epoch": 0.4893460690668626,
"grad_norm": 1.8369132895032922,
"learning_rate": 5.416647343830687e-07,
"loss": 1.0335,
"step": 333
},
{
"epoch": 0.4908155767817781,
"grad_norm": 1.7227201967047996,
"learning_rate": 5.392889540376006e-07,
"loss": 1.0121,
"step": 334
},
{
"epoch": 0.4922850844966936,
"grad_norm": 1.8514670496237804,
"learning_rate": 5.369122808008955e-07,
"loss": 1.0267,
"step": 335
},
{
"epoch": 0.4937545922116091,
"grad_norm": 1.8281145295077228,
"learning_rate": 5.345347686858626e-07,
"loss": 1.0312,
"step": 336
},
{
"epoch": 0.4952240999265246,
"grad_norm": 2.0505787508652165,
"learning_rate": 5.321564717244757e-07,
"loss": 1.0363,
"step": 337
},
{
"epoch": 0.4966936076414401,
"grad_norm": 1.9881421461034916,
"learning_rate": 5.297774439665449e-07,
"loss": 1.0525,
"step": 338
},
{
"epoch": 0.4981631153563556,
"grad_norm": 1.7909173829595026,
"learning_rate": 5.273977394784892e-07,
"loss": 1.0272,
"step": 339
},
{
"epoch": 0.4996326230712711,
"grad_norm": 1.8358490715566034,
"learning_rate": 5.250174123421068e-07,
"loss": 1.1935,
"step": 340
},
{
"epoch": 0.5011021307861866,
"grad_norm": 1.821995825437999,
"learning_rate": 5.226365166533458e-07,
"loss": 1.0467,
"step": 341
},
{
"epoch": 0.5025716385011021,
"grad_norm": 1.7931454394377604,
"learning_rate": 5.202551065210768e-07,
"loss": 0.991,
"step": 342
},
{
"epoch": 0.5040411462160176,
"grad_norm": 1.7628339353097855,
"learning_rate": 5.178732360658605e-07,
"loss": 1.063,
"step": 343
},
{
"epoch": 0.5055106539309331,
"grad_norm": 1.8147261888211064,
"learning_rate": 5.154909594187192e-07,
"loss": 1.055,
"step": 344
},
{
"epoch": 0.5069801616458487,
"grad_norm": 1.7863871342410091,
"learning_rate": 5.131083307199071e-07,
"loss": 1.0626,
"step": 345
},
{
"epoch": 0.5084496693607642,
"grad_norm": 1.730735975505189,
"learning_rate": 5.107254041176788e-07,
"loss": 1.0818,
"step": 346
},
{
"epoch": 0.5099191770756797,
"grad_norm": 1.7642264640801142,
"learning_rate": 5.08342233767059e-07,
"loss": 1.0145,
"step": 347
},
{
"epoch": 0.5113886847905952,
"grad_norm": 1.8089809513102568,
"learning_rate": 5.059588738286118e-07,
"loss": 1.1006,
"step": 348
},
{
"epoch": 0.5128581925055107,
"grad_norm": 1.8302210019412795,
"learning_rate": 5.035753784672105e-07,
"loss": 1.047,
"step": 349
},
{
"epoch": 0.5143277002204262,
"grad_norm": 1.9024588115083674,
"learning_rate": 5.011918018508057e-07,
"loss": 1.0686,
"step": 350
},
{
"epoch": 0.5157972079353417,
"grad_norm": 1.759141919781444,
"learning_rate": 4.988081981491944e-07,
"loss": 1.018,
"step": 351
},
{
"epoch": 0.5172667156502572,
"grad_norm": 1.889727689838186,
"learning_rate": 4.964246215327894e-07,
"loss": 1.1436,
"step": 352
},
{
"epoch": 0.5187362233651727,
"grad_norm": 1.7456126845475135,
"learning_rate": 4.940411261713882e-07,
"loss": 1.015,
"step": 353
},
{
"epoch": 0.5202057310800882,
"grad_norm": 1.7664326346015127,
"learning_rate": 4.91657766232941e-07,
"loss": 1.09,
"step": 354
},
{
"epoch": 0.5216752387950037,
"grad_norm": 1.7222941541710777,
"learning_rate": 4.892745958823213e-07,
"loss": 1.0243,
"step": 355
},
{
"epoch": 0.5231447465099192,
"grad_norm": 1.7157057108244869,
"learning_rate": 4.868916692800928e-07,
"loss": 1.0408,
"step": 356
},
{
"epoch": 0.5246142542248347,
"grad_norm": 1.824734949262758,
"learning_rate": 4.845090405812809e-07,
"loss": 1.0149,
"step": 357
},
{
"epoch": 0.5260837619397501,
"grad_norm": 1.8249245650143116,
"learning_rate": 4.821267639341397e-07,
"loss": 0.9811,
"step": 358
},
{
"epoch": 0.5275532696546656,
"grad_norm": 1.7495581100985684,
"learning_rate": 4.797448934789232e-07,
"loss": 1.0158,
"step": 359
},
{
"epoch": 0.5290227773695811,
"grad_norm": 1.773554088711297,
"learning_rate": 4.773634833466541e-07,
"loss": 0.9899,
"step": 360
},
{
"epoch": 0.5304922850844966,
"grad_norm": 1.7708057357465794,
"learning_rate": 4.7498258765789335e-07,
"loss": 0.9971,
"step": 361
},
{
"epoch": 0.5319617927994122,
"grad_norm": 1.8963492839232228,
"learning_rate": 4.726022605215108e-07,
"loss": 1.1343,
"step": 362
},
{
"epoch": 0.5334313005143277,
"grad_norm": 1.8915073493453016,
"learning_rate": 4.7022255603345504e-07,
"loss": 0.9819,
"step": 363
},
{
"epoch": 0.5349008082292432,
"grad_norm": 1.876306774074019,
"learning_rate": 4.6784352827552433e-07,
"loss": 1.0328,
"step": 364
},
{
"epoch": 0.5363703159441587,
"grad_norm": 1.7402642886140849,
"learning_rate": 4.6546523131413737e-07,
"loss": 1.0202,
"step": 365
},
{
"epoch": 0.5378398236590742,
"grad_norm": 1.766894077634918,
"learning_rate": 4.6308771919910455e-07,
"loss": 1.0183,
"step": 366
},
{
"epoch": 0.5393093313739897,
"grad_norm": 1.8710398967234652,
"learning_rate": 4.607110459623994e-07,
"loss": 0.9959,
"step": 367
},
{
"epoch": 0.5407788390889052,
"grad_norm": 1.8609893131836714,
"learning_rate": 4.5833526561693146e-07,
"loss": 1.0279,
"step": 368
},
{
"epoch": 0.5422483468038207,
"grad_norm": 1.8161838491547273,
"learning_rate": 4.559604321553176e-07,
"loss": 1.0187,
"step": 369
},
{
"epoch": 0.5437178545187362,
"grad_norm": 1.7513215748849267,
"learning_rate": 4.535865995486559e-07,
"loss": 1.0668,
"step": 370
},
{
"epoch": 0.5451873622336517,
"grad_norm": 1.9792905956550824,
"learning_rate": 4.512138217452984e-07,
"loss": 0.9754,
"step": 371
},
{
"epoch": 0.5466568699485672,
"grad_norm": 1.8313364606031188,
"learning_rate": 4.488421526696259e-07,
"loss": 1.027,
"step": 372
},
{
"epoch": 0.5481263776634827,
"grad_norm": 1.7125757541553044,
"learning_rate": 4.464716462208216e-07,
"loss": 0.9887,
"step": 373
},
{
"epoch": 0.5495958853783982,
"grad_norm": 1.8168674762154746,
"learning_rate": 4.441023562716464e-07,
"loss": 1.0634,
"step": 374
},
{
"epoch": 0.5510653930933137,
"grad_norm": 1.7943803054480878,
"learning_rate": 4.417343366672154e-07,
"loss": 0.9982,
"step": 375
},
{
"epoch": 0.5525349008082292,
"grad_norm": 1.8144203745899334,
"learning_rate": 4.393676412237726e-07,
"loss": 1.0639,
"step": 376
},
{
"epoch": 0.5540044085231447,
"grad_norm": 1.8410394383068294,
"learning_rate": 4.370023237274693e-07,
"loss": 0.9708,
"step": 377
},
{
"epoch": 0.5554739162380602,
"grad_norm": 1.8238414471621205,
"learning_rate": 4.3463843793314123e-07,
"loss": 0.9859,
"step": 378
},
{
"epoch": 0.5569434239529758,
"grad_norm": 1.8116442983581678,
"learning_rate": 4.322760375630867e-07,
"loss": 1.1102,
"step": 379
},
{
"epoch": 0.5584129316678913,
"grad_norm": 1.709109580058877,
"learning_rate": 4.299151763058457e-07,
"loss": 1.0817,
"step": 380
},
{
"epoch": 0.5598824393828068,
"grad_norm": 1.8180233791321092,
"learning_rate": 4.2755590781498056e-07,
"loss": 0.9678,
"step": 381
},
{
"epoch": 0.5613519470977223,
"grad_norm": 1.748975999116001,
"learning_rate": 4.251982857078553e-07,
"loss": 0.9776,
"step": 382
},
{
"epoch": 0.5628214548126378,
"grad_norm": 1.9292369702259817,
"learning_rate": 4.2284236356441817e-07,
"loss": 1.0483,
"step": 383
},
{
"epoch": 0.5642909625275533,
"grad_norm": 2.01824515708808,
"learning_rate": 4.204881949259832e-07,
"loss": 1.1453,
"step": 384
},
{
"epoch": 0.5657604702424688,
"grad_norm": 1.7138713007750266,
"learning_rate": 4.181358332940144e-07,
"loss": 1.0395,
"step": 385
},
{
"epoch": 0.5672299779573843,
"grad_norm": 1.821190672284049,
"learning_rate": 4.157853321289089e-07,
"loss": 1.0557,
"step": 386
},
{
"epoch": 0.5686994856722998,
"grad_norm": 1.8426132732229954,
"learning_rate": 4.1343674484878236e-07,
"loss": 1.0369,
"step": 387
},
{
"epoch": 0.5701689933872153,
"grad_norm": 1.8067549348583807,
"learning_rate": 4.11090124828255e-07,
"loss": 1.0121,
"step": 388
},
{
"epoch": 0.5716385011021308,
"grad_norm": 1.7768823877409583,
"learning_rate": 4.0874552539723873e-07,
"loss": 1.0319,
"step": 389
},
{
"epoch": 0.5731080088170463,
"grad_norm": 1.9410827099670018,
"learning_rate": 4.064029998397247e-07,
"loss": 1.0548,
"step": 390
},
{
"epoch": 0.5745775165319618,
"grad_norm": 1.7974497316933307,
"learning_rate": 4.04062601392573e-07,
"loss": 1.0468,
"step": 391
},
{
"epoch": 0.5760470242468773,
"grad_norm": 1.814540333924481,
"learning_rate": 4.017243832443021e-07,
"loss": 1.0443,
"step": 392
},
{
"epoch": 0.5775165319617928,
"grad_norm": 1.7702673169768615,
"learning_rate": 3.993883985338808e-07,
"loss": 1.0199,
"step": 393
},
{
"epoch": 0.5789860396767083,
"grad_norm": 1.6726234345970885,
"learning_rate": 3.9705470034951986e-07,
"loss": 1.0008,
"step": 394
},
{
"epoch": 0.5804555473916239,
"grad_norm": 1.7525297959261115,
"learning_rate": 3.9472334172746596e-07,
"loss": 1.0525,
"step": 395
},
{
"epoch": 0.5819250551065394,
"grad_norm": 1.8829571527912472,
"learning_rate": 3.9239437565079645e-07,
"loss": 1.0586,
"step": 396
},
{
"epoch": 0.5833945628214549,
"grad_norm": 1.8402564713547034,
"learning_rate": 3.900678550482147e-07,
"loss": 1.0469,
"step": 397
},
{
"epoch": 0.5848640705363704,
"grad_norm": 1.7484377822738735,
"learning_rate": 3.877438327928478e-07,
"loss": 1.0515,
"step": 398
},
{
"epoch": 0.5863335782512858,
"grad_norm": 1.7734813614367146,
"learning_rate": 3.854223617010448e-07,
"loss": 0.9925,
"step": 399
},
{
"epoch": 0.5878030859662013,
"grad_norm": 1.8437569756188645,
"learning_rate": 3.8310349453117617e-07,
"loss": 1.0257,
"step": 400
},
{
"epoch": 0.5892725936811168,
"grad_norm": 1.7695848999552912,
"learning_rate": 3.8078728398243503e-07,
"loss": 1.0602,
"step": 401
},
{
"epoch": 0.5907421013960323,
"grad_norm": 1.812644515523391,
"learning_rate": 3.784737826936393e-07,
"loss": 1.0012,
"step": 402
},
{
"epoch": 0.5922116091109478,
"grad_norm": 1.7016974236723055,
"learning_rate": 3.761630432420358e-07,
"loss": 0.9757,
"step": 403
},
{
"epoch": 0.5936811168258633,
"grad_norm": 1.8251040601296213,
"learning_rate": 3.7385511814210493e-07,
"loss": 1.0435,
"step": 404
},
{
"epoch": 0.5951506245407788,
"grad_norm": 1.882206985801385,
"learning_rate": 3.715500598443672e-07,
"loss": 1.0125,
"step": 405
},
{
"epoch": 0.5966201322556943,
"grad_norm": 1.700935158915743,
"learning_rate": 3.6924792073419193e-07,
"loss": 1.0304,
"step": 406
},
{
"epoch": 0.5980896399706098,
"grad_norm": 1.821591367605163,
"learning_rate": 3.6694875313060567e-07,
"loss": 1.1047,
"step": 407
},
{
"epoch": 0.5995591476855253,
"grad_norm": 1.9222790614713745,
"learning_rate": 3.646526092851039e-07,
"loss": 1.1109,
"step": 408
},
{
"epoch": 0.6010286554004408,
"grad_norm": 1.7353537741399607,
"learning_rate": 3.623595413804636e-07,
"loss": 1.0393,
"step": 409
},
{
"epoch": 0.6024981631153563,
"grad_norm": 1.7671934350452785,
"learning_rate": 3.600696015295568e-07,
"loss": 1.023,
"step": 410
},
{
"epoch": 0.6039676708302718,
"grad_norm": 1.8243296781220124,
"learning_rate": 3.577828417741665e-07,
"loss": 1.0938,
"step": 411
},
{
"epoch": 0.6054371785451874,
"grad_norm": 1.7808942851487597,
"learning_rate": 3.5549931408380446e-07,
"loss": 1.049,
"step": 412
},
{
"epoch": 0.6069066862601029,
"grad_norm": 1.7751176328929308,
"learning_rate": 3.5321907035452913e-07,
"loss": 0.995,
"step": 413
},
{
"epoch": 0.6083761939750184,
"grad_norm": 1.80138825273579,
"learning_rate": 3.509421624077672e-07,
"loss": 1.0378,
"step": 414
},
{
"epoch": 0.6098457016899339,
"grad_norm": 1.734060593988662,
"learning_rate": 3.486686419891349e-07,
"loss": 0.973,
"step": 415
},
{
"epoch": 0.6113152094048494,
"grad_norm": 1.814138504869964,
"learning_rate": 3.4639856076726346e-07,
"loss": 1.0429,
"step": 416
},
{
"epoch": 0.6127847171197649,
"grad_norm": 1.8227189292521806,
"learning_rate": 3.4413197033262343e-07,
"loss": 1.0348,
"step": 417
},
{
"epoch": 0.6142542248346804,
"grad_norm": 1.7945450128155505,
"learning_rate": 3.4186892219635254e-07,
"loss": 1.0017,
"step": 418
},
{
"epoch": 0.6157237325495959,
"grad_norm": 1.8233464040803875,
"learning_rate": 3.396094677890862e-07,
"loss": 1.0457,
"step": 419
},
{
"epoch": 0.6171932402645114,
"grad_norm": 1.8995720630920003,
"learning_rate": 3.373536584597869e-07,
"loss": 0.9766,
"step": 420
},
{
"epoch": 0.6186627479794269,
"grad_norm": 1.7848342121334433,
"learning_rate": 3.3510154547457845e-07,
"loss": 1.0378,
"step": 421
},
{
"epoch": 0.6201322556943424,
"grad_norm": 1.7452555853428344,
"learning_rate": 3.3285318001558076e-07,
"loss": 1.0456,
"step": 422
},
{
"epoch": 0.6216017634092579,
"grad_norm": 1.7706320471133385,
"learning_rate": 3.306086131797462e-07,
"loss": 1.028,
"step": 423
},
{
"epoch": 0.6230712711241734,
"grad_norm": 1.8287154878240726,
"learning_rate": 3.283678959776986e-07,
"loss": 1.0694,
"step": 424
},
{
"epoch": 0.6245407788390889,
"grad_norm": 1.9361160647239,
"learning_rate": 3.261310793325742e-07,
"loss": 1.0285,
"step": 425
},
{
"epoch": 0.6260102865540044,
"grad_norm": 1.7593247945393256,
"learning_rate": 3.23898214078864e-07,
"loss": 1.0826,
"step": 426
},
{
"epoch": 0.6274797942689199,
"grad_norm": 1.9132628423312081,
"learning_rate": 3.216693509612587e-07,
"loss": 1.0267,
"step": 427
},
{
"epoch": 0.6289493019838354,
"grad_norm": 1.8484127698802801,
"learning_rate": 3.19444540633495e-07,
"loss": 1.0718,
"step": 428
},
{
"epoch": 0.630418809698751,
"grad_norm": 1.8916521672529822,
"learning_rate": 3.172238336572056e-07,
"loss": 1.157,
"step": 429
},
{
"epoch": 0.6318883174136665,
"grad_norm": 2.9742222052103013,
"learning_rate": 3.1500728050076873e-07,
"loss": 1.0119,
"step": 430
},
{
"epoch": 0.633357825128582,
"grad_norm": 1.7337670893421475,
"learning_rate": 3.1279493153816183e-07,
"loss": 0.9856,
"step": 431
},
{
"epoch": 0.6348273328434975,
"grad_norm": 1.7909370519049963,
"learning_rate": 3.1058683704781707e-07,
"loss": 0.9805,
"step": 432
},
{
"epoch": 0.636296840558413,
"grad_norm": 1.7354057723295175,
"learning_rate": 3.0838304721147803e-07,
"loss": 1.0015,
"step": 433
},
{
"epoch": 0.6377663482733285,
"grad_norm": 1.7551304317860974,
"learning_rate": 3.0618361211305956e-07,
"loss": 1.1138,
"step": 434
},
{
"epoch": 0.639235855988244,
"grad_norm": 1.8366440682808711,
"learning_rate": 3.0398858173750994e-07,
"loss": 1.0614,
"step": 435
},
{
"epoch": 0.6407053637031594,
"grad_norm": 1.7891758756508314,
"learning_rate": 3.0179800596967414e-07,
"loss": 1.084,
"step": 436
},
{
"epoch": 0.6421748714180749,
"grad_norm": 1.7187905850769671,
"learning_rate": 2.996119345931607e-07,
"loss": 1.0292,
"step": 437
},
{
"epoch": 0.6436443791329904,
"grad_norm": 1.740058780022097,
"learning_rate": 2.9743041728921004e-07,
"loss": 0.9946,
"step": 438
},
{
"epoch": 0.6451138868479059,
"grad_norm": 1.9099618734962607,
"learning_rate": 2.952535036355659e-07,
"loss": 0.9565,
"step": 439
},
{
"epoch": 0.6465833945628214,
"grad_norm": 1.7980156429513823,
"learning_rate": 2.930812431053477e-07,
"loss": 1.1037,
"step": 440
},
{
"epoch": 0.6480529022777369,
"grad_norm": 1.813220963862845,
"learning_rate": 2.9091368506592704e-07,
"loss": 1.0499,
"step": 441
},
{
"epoch": 0.6495224099926524,
"grad_norm": 1.772153448992575,
"learning_rate": 2.8875087877780547e-07,
"loss": 0.9447,
"step": 442
},
{
"epoch": 0.6509919177075679,
"grad_norm": 1.7658892705830358,
"learning_rate": 2.865928733934951e-07,
"loss": 1.0119,
"step": 443
},
{
"epoch": 0.6524614254224834,
"grad_norm": 1.847269826581132,
"learning_rate": 2.844397179564009e-07,
"loss": 1.0514,
"step": 444
},
{
"epoch": 0.6539309331373989,
"grad_norm": 1.746147021855409,
"learning_rate": 2.8229146139970725e-07,
"loss": 1.0209,
"step": 445
},
{
"epoch": 0.6554004408523145,
"grad_norm": 1.732044566838772,
"learning_rate": 2.8014815254526475e-07,
"loss": 0.9906,
"step": 446
},
{
"epoch": 0.65686994856723,
"grad_norm": 1.8978735745750135,
"learning_rate": 2.780098401024816e-07,
"loss": 1.0946,
"step": 447
},
{
"epoch": 0.6583394562821455,
"grad_norm": 1.87807485244721,
"learning_rate": 2.7587657266721633e-07,
"loss": 0.9462,
"step": 448
},
{
"epoch": 0.659808963997061,
"grad_norm": 1.7847823162110357,
"learning_rate": 2.737483987206725e-07,
"loss": 0.9834,
"step": 449
},
{
"epoch": 0.6612784717119765,
"grad_norm": 1.7587621212176707,
"learning_rate": 2.7162536662829836e-07,
"loss": 0.9779,
"step": 450
},
{
"epoch": 0.662747979426892,
"grad_norm": 1.774690627426415,
"learning_rate": 2.695075246386874e-07,
"loss": 0.977,
"step": 451
},
{
"epoch": 0.6642174871418075,
"grad_norm": 1.7853964662457704,
"learning_rate": 2.673949208824804e-07,
"loss": 0.9579,
"step": 452
},
{
"epoch": 0.665686994856723,
"grad_norm": 1.813872909885941,
"learning_rate": 2.6528760337127344e-07,
"loss": 1.0073,
"step": 453
},
{
"epoch": 0.6671565025716385,
"grad_norm": 1.7502967135919663,
"learning_rate": 2.6318561999652543e-07,
"loss": 1.0442,
"step": 454
},
{
"epoch": 0.668626010286554,
"grad_norm": 1.8053794650101893,
"learning_rate": 2.610890185284707e-07,
"loss": 1.0947,
"step": 455
},
{
"epoch": 0.6700955180014695,
"grad_norm": 1.7729452406831345,
"learning_rate": 2.5899784661503306e-07,
"loss": 1.0961,
"step": 456
},
{
"epoch": 0.671565025716385,
"grad_norm": 1.7354885528303905,
"learning_rate": 2.569121517807421e-07,
"loss": 1.013,
"step": 457
},
{
"epoch": 0.6730345334313005,
"grad_norm": 1.7765708056979728,
"learning_rate": 2.5483198142565454e-07,
"loss": 1.0584,
"step": 458
},
{
"epoch": 0.674504041146216,
"grad_norm": 1.7791464282680176,
"learning_rate": 2.5275738282427627e-07,
"loss": 1.098,
"step": 459
},
{
"epoch": 0.6759735488611315,
"grad_norm": 1.7637882278681942,
"learning_rate": 2.506884031244875e-07,
"loss": 1.0006,
"step": 460
},
{
"epoch": 0.677443056576047,
"grad_norm": 1.7540544809473422,
"learning_rate": 2.4862508934647215e-07,
"loss": 1.0033,
"step": 461
},
{
"epoch": 0.6789125642909625,
"grad_norm": 1.7585631184248087,
"learning_rate": 2.465674883816492e-07,
"loss": 1.0656,
"step": 462
},
{
"epoch": 0.6803820720058781,
"grad_norm": 1.808996319528907,
"learning_rate": 2.445156469916059e-07,
"loss": 0.9714,
"step": 463
},
{
"epoch": 0.6818515797207936,
"grad_norm": 1.809843495968243,
"learning_rate": 2.424696118070367e-07,
"loss": 1.0581,
"step": 464
},
{
"epoch": 0.6833210874357091,
"grad_norm": 1.6670442041884845,
"learning_rate": 2.404294293266823e-07,
"loss": 0.9425,
"step": 465
},
{
"epoch": 0.6847905951506246,
"grad_norm": 1.76399522732867,
"learning_rate": 2.3839514591627298e-07,
"loss": 1.0518,
"step": 466
},
{
"epoch": 0.6862601028655401,
"grad_norm": 1.7332927507318467,
"learning_rate": 2.3636680780747574e-07,
"loss": 1.0519,
"step": 467
},
{
"epoch": 0.6877296105804556,
"grad_norm": 1.7593209946714115,
"learning_rate": 2.3434446109684303e-07,
"loss": 1.0678,
"step": 468
},
{
"epoch": 0.6891991182953711,
"grad_norm": 1.844987519782162,
"learning_rate": 2.323281517447646e-07,
"loss": 1.0948,
"step": 469
},
{
"epoch": 0.6906686260102866,
"grad_norm": 1.7890685035001295,
"learning_rate": 2.3031792557442426e-07,
"loss": 0.9799,
"step": 470
},
{
"epoch": 0.692138133725202,
"grad_norm": 1.8257310859124949,
"learning_rate": 2.2831382827075758e-07,
"loss": 0.9977,
"step": 471
},
{
"epoch": 0.6936076414401176,
"grad_norm": 1.863003326170686,
"learning_rate": 2.2631590537941348e-07,
"loss": 1.0266,
"step": 472
},
{
"epoch": 0.695077149155033,
"grad_norm": 1.796551415142973,
"learning_rate": 2.2432420230572014e-07,
"loss": 1.05,
"step": 473
},
{
"epoch": 0.6965466568699485,
"grad_norm": 1.7791843008462636,
"learning_rate": 2.223387643136524e-07,
"loss": 1.0631,
"step": 474
},
{
"epoch": 0.698016164584864,
"grad_norm": 1.8342190635267368,
"learning_rate": 2.2035963652480266e-07,
"loss": 1.0214,
"step": 475
},
{
"epoch": 0.6994856722997795,
"grad_norm": 1.8734533497691483,
"learning_rate": 2.183868639173568e-07,
"loss": 0.9609,
"step": 476
},
{
"epoch": 0.700955180014695,
"grad_norm": 1.7146756401235461,
"learning_rate": 2.1642049132507013e-07,
"loss": 1.0848,
"step": 477
},
{
"epoch": 0.7024246877296105,
"grad_norm": 1.8197278079364136,
"learning_rate": 2.144605634362504e-07,
"loss": 0.9978,
"step": 478
},
{
"epoch": 0.7038941954445261,
"grad_norm": 1.8066318762651958,
"learning_rate": 2.125071247927412e-07,
"loss": 1.001,
"step": 479
},
{
"epoch": 0.7053637031594416,
"grad_norm": 1.8332236077939572,
"learning_rate": 2.1056021978890915e-07,
"loss": 1.0248,
"step": 480
},
{
"epoch": 0.7068332108743571,
"grad_norm": 1.752732727944847,
"learning_rate": 2.0861989267063622e-07,
"loss": 1.0468,
"step": 481
},
{
"epoch": 0.7083027185892726,
"grad_norm": 1.760265234031662,
"learning_rate": 2.0668618753431372e-07,
"loss": 1.0598,
"step": 482
},
{
"epoch": 0.7097722263041881,
"grad_norm": 1.830543381313312,
"learning_rate": 2.0475914832583936e-07,
"loss": 1.0581,
"step": 483
},
{
"epoch": 0.7112417340191036,
"grad_norm": 1.7527247281185463,
"learning_rate": 2.0283881883961978e-07,
"loss": 1.0606,
"step": 484
},
{
"epoch": 0.7127112417340191,
"grad_norm": 1.7445145256230117,
"learning_rate": 2.0092524271757472e-07,
"loss": 1.0332,
"step": 485
},
{
"epoch": 0.7141807494489346,
"grad_norm": 1.8241619108049254,
"learning_rate": 1.990184634481446e-07,
"loss": 1.0245,
"step": 486
},
{
"epoch": 0.7156502571638501,
"grad_norm": 1.875499142180598,
"learning_rate": 1.9711852436530318e-07,
"loss": 1.0644,
"step": 487
},
{
"epoch": 0.7171197648787656,
"grad_norm": 1.846389740515158,
"learning_rate": 1.952254686475726e-07,
"loss": 1.0553,
"step": 488
},
{
"epoch": 0.7185892725936811,
"grad_norm": 1.8163254096673638,
"learning_rate": 1.9333933931704098e-07,
"loss": 0.998,
"step": 489
},
{
"epoch": 0.7200587803085966,
"grad_norm": 1.7623549998600847,
"learning_rate": 1.914601792383862e-07,
"loss": 1.0027,
"step": 490
},
{
"epoch": 0.7215282880235121,
"grad_norm": 1.714869784410914,
"learning_rate": 1.8958803111790105e-07,
"loss": 1.0056,
"step": 491
},
{
"epoch": 0.7229977957384276,
"grad_norm": 1.7876284758260355,
"learning_rate": 1.877229375025222e-07,
"loss": 1.0857,
"step": 492
},
{
"epoch": 0.7244673034533431,
"grad_norm": 1.6960378155490023,
"learning_rate": 1.8586494077886416e-07,
"loss": 1.0358,
"step": 493
},
{
"epoch": 0.7259368111682586,
"grad_norm": 1.8958781149495285,
"learning_rate": 1.840140831722557e-07,
"loss": 1.0456,
"step": 494
},
{
"epoch": 0.7274063188831741,
"grad_norm": 1.847902129513823,
"learning_rate": 1.821704067457795e-07,
"loss": 1.0299,
"step": 495
},
{
"epoch": 0.7288758265980897,
"grad_norm": 1.781753600620522,
"learning_rate": 1.803339533993175e-07,
"loss": 1.0461,
"step": 496
},
{
"epoch": 0.7303453343130052,
"grad_norm": 1.7044619507439027,
"learning_rate": 1.7850476486859784e-07,
"loss": 1.0307,
"step": 497
},
{
"epoch": 0.7318148420279207,
"grad_norm": 1.7635434595560795,
"learning_rate": 1.766828827242461e-07,
"loss": 1.0086,
"step": 498
},
{
"epoch": 0.7332843497428362,
"grad_norm": 1.7820427578619338,
"learning_rate": 1.7486834837084147e-07,
"loss": 1.1165,
"step": 499
},
{
"epoch": 0.7347538574577517,
"grad_norm": 1.9097746083160565,
"learning_rate": 1.7306120304597516e-07,
"loss": 1.0461,
"step": 500
},
{
"epoch": 0.7347538574577517,
"eval_ical_mcts_chains_sft_val_MORECHAINS_loss": 2.7085540294647217,
"eval_ical_mcts_chains_sft_val_MORECHAINS_runtime": 17.8085,
"eval_ical_mcts_chains_sft_val_MORECHAINS_samples_per_second": 5.222,
"eval_ical_mcts_chains_sft_val_MORECHAINS_steps_per_second": 0.674,
"step": 500
},
{
"epoch": 0.7362233651726672,
"grad_norm": 1.783765348036335,
"learning_rate": 1.7126148781931309e-07,
"loss": 1.1337,
"step": 501
},
{
"epoch": 0.7376928728875827,
"grad_norm": 1.7555139522882506,
"learning_rate": 1.6946924359166332e-07,
"loss": 1.0658,
"step": 502
},
{
"epoch": 0.7391623806024982,
"grad_norm": 1.740923755234073,
"learning_rate": 1.6768451109404518e-07,
"loss": 1.0809,
"step": 503
},
{
"epoch": 0.7406318883174137,
"grad_norm": 1.7576518353306747,
"learning_rate": 1.659073308867653e-07,
"loss": 1.0141,
"step": 504
},
{
"epoch": 0.7421013960323292,
"grad_norm": 1.8195633960647817,
"learning_rate": 1.641377433584945e-07,
"loss": 1.0799,
"step": 505
},
{
"epoch": 0.7435709037472447,
"grad_norm": 1.724184703939458,
"learning_rate": 1.6237578872535023e-07,
"loss": 1.0377,
"step": 506
},
{
"epoch": 0.7450404114621602,
"grad_norm": 1.7366929295964126,
"learning_rate": 1.6062150702998307e-07,
"loss": 1.0373,
"step": 507
},
{
"epoch": 0.7465099191770757,
"grad_norm": 1.828841387885778,
"learning_rate": 1.5887493814066632e-07,
"loss": 1.0053,
"step": 508
},
{
"epoch": 0.7479794268919912,
"grad_norm": 1.799844847030494,
"learning_rate": 1.5713612175038953e-07,
"loss": 1.0182,
"step": 509
},
{
"epoch": 0.7494489346069066,
"grad_norm": 1.7736398311243526,
"learning_rate": 1.5540509737595752e-07,
"loss": 0.9885,
"step": 510
},
{
"epoch": 0.7509184423218221,
"grad_norm": 1.7318314688384178,
"learning_rate": 1.536819043570915e-07,
"loss": 1.0109,
"step": 511
},
{
"epoch": 0.7523879500367376,
"grad_norm": 1.7009255541723038,
"learning_rate": 1.5196658185553484e-07,
"loss": 1.0023,
"step": 512
},
{
"epoch": 0.7538574577516532,
"grad_norm": 1.7989686822366986,
"learning_rate": 1.5025916885416385e-07,
"loss": 1.0307,
"step": 513
},
{
"epoch": 0.7553269654665687,
"grad_norm": 1.8392247482233148,
"learning_rate": 1.485597041561014e-07,
"loss": 1.1094,
"step": 514
},
{
"epoch": 0.7567964731814842,
"grad_norm": 1.7569776212028672,
"learning_rate": 1.4686822638383485e-07,
"loss": 1.0628,
"step": 515
},
{
"epoch": 0.7582659808963997,
"grad_norm": 1.7919919083958182,
"learning_rate": 1.4518477397833868e-07,
"loss": 1.0299,
"step": 516
},
{
"epoch": 0.7597354886113152,
"grad_norm": 1.7659155447820256,
"learning_rate": 1.4350938519820082e-07,
"loss": 1.0487,
"step": 517
},
{
"epoch": 0.7612049963262307,
"grad_norm": 1.7726004655315755,
"learning_rate": 1.4184209811875314e-07,
"loss": 0.9892,
"step": 518
},
{
"epoch": 0.7626745040411462,
"grad_norm": 1.7352203131127852,
"learning_rate": 1.401829506312061e-07,
"loss": 1.0795,
"step": 519
},
{
"epoch": 0.7641440117560617,
"grad_norm": 1.7457325967667972,
"learning_rate": 1.385319804417872e-07,
"loss": 1.0278,
"step": 520
},
{
"epoch": 0.7656135194709772,
"grad_norm": 1.7557251595785084,
"learning_rate": 1.3688922507088506e-07,
"loss": 1.0023,
"step": 521
},
{
"epoch": 0.7670830271858927,
"grad_norm": 1.8500663234582253,
"learning_rate": 1.35254721852196e-07,
"loss": 1.1031,
"step": 522
},
{
"epoch": 0.7685525349008082,
"grad_norm": 1.7675527728702798,
"learning_rate": 1.3362850793187536e-07,
"loss": 1.0998,
"step": 523
},
{
"epoch": 0.7700220426157237,
"grad_norm": 1.7802520405309632,
"learning_rate": 1.3201062026769415e-07,
"loss": 1.081,
"step": 524
},
{
"epoch": 0.7714915503306392,
"grad_norm": 1.8571095737653232,
"learning_rate": 1.3040109562819852e-07,
"loss": 1.0501,
"step": 525
},
{
"epoch": 0.7729610580455547,
"grad_norm": 1.737514001066965,
"learning_rate": 1.2879997059187402e-07,
"loss": 1.1032,
"step": 526
},
{
"epoch": 0.7744305657604702,
"grad_norm": 1.7415002496273955,
"learning_rate": 1.27207281546315e-07,
"loss": 1.0373,
"step": 527
},
{
"epoch": 0.7759000734753857,
"grad_norm": 1.7233552370133884,
"learning_rate": 1.2562306468739707e-07,
"loss": 1.0011,
"step": 528
},
{
"epoch": 0.7773695811903012,
"grad_norm": 1.7771726750205339,
"learning_rate": 1.2404735601845446e-07,
"loss": 1.0235,
"step": 529
},
{
"epoch": 0.7788390889052168,
"grad_norm": 1.7589516685296818,
"learning_rate": 1.2248019134946224e-07,
"loss": 1.0871,
"step": 530
},
{
"epoch": 0.7803085966201323,
"grad_norm": 1.8904879764816582,
"learning_rate": 1.2092160629622243e-07,
"loss": 1.1743,
"step": 531
},
{
"epoch": 0.7817781043350478,
"grad_norm": 1.679478054850629,
"learning_rate": 1.1937163627955388e-07,
"loss": 0.9987,
"step": 532
},
{
"epoch": 0.7832476120499633,
"grad_norm": 1.7700957005048465,
"learning_rate": 1.1783031652448844e-07,
"loss": 1.013,
"step": 533
},
{
"epoch": 0.7847171197648788,
"grad_norm": 1.698212822022776,
"learning_rate": 1.1629768205946916e-07,
"loss": 1.0289,
"step": 534
},
{
"epoch": 0.7861866274797943,
"grad_norm": 1.6982580846175706,
"learning_rate": 1.1477376771555547e-07,
"loss": 1.0437,
"step": 535
},
{
"epoch": 0.7876561351947098,
"grad_norm": 1.8951494938436424,
"learning_rate": 1.1325860812563082e-07,
"loss": 1.0241,
"step": 536
},
{
"epoch": 0.7891256429096253,
"grad_norm": 1.7809478211990022,
"learning_rate": 1.1175223772361548e-07,
"loss": 1.0971,
"step": 537
},
{
"epoch": 0.7905951506245408,
"grad_norm": 1.8138774494697176,
"learning_rate": 1.1025469074368465e-07,
"loss": 1.0308,
"step": 538
},
{
"epoch": 0.7920646583394563,
"grad_norm": 1.7706413795439244,
"learning_rate": 1.0876600121949014e-07,
"loss": 0.9894,
"step": 539
},
{
"epoch": 0.7935341660543718,
"grad_norm": 1.7128344759591312,
"learning_rate": 1.0728620298338647e-07,
"loss": 0.9308,
"step": 540
},
{
"epoch": 0.7950036737692873,
"grad_norm": 1.7376472280744786,
"learning_rate": 1.058153296656627e-07,
"loss": 0.9786,
"step": 541
},
{
"epoch": 0.7964731814842028,
"grad_norm": 1.6180557950522634,
"learning_rate": 1.0435341469377785e-07,
"loss": 0.9778,
"step": 542
},
{
"epoch": 0.7979426891991183,
"grad_norm": 1.779926196605351,
"learning_rate": 1.0290049129160083e-07,
"loss": 1.0213,
"step": 543
},
{
"epoch": 0.7994121969140338,
"grad_norm": 1.7660627522019654,
"learning_rate": 1.0145659247865606e-07,
"loss": 0.9912,
"step": 544
},
{
"epoch": 0.8008817046289493,
"grad_norm": 1.7480192933335235,
"learning_rate": 1.0002175106937282e-07,
"loss": 1.0209,
"step": 545
},
{
"epoch": 0.8023512123438648,
"grad_norm": 1.771603438272948,
"learning_rate": 9.859599967233901e-08,
"loss": 1.0015,
"step": 546
},
{
"epoch": 0.8038207200587804,
"grad_norm": 1.6631521749746068,
"learning_rate": 9.717937068956083e-08,
"loss": 1.0255,
"step": 547
},
{
"epoch": 0.8052902277736959,
"grad_norm": 1.725895874046473,
"learning_rate": 9.577189631572613e-08,
"loss": 1.0477,
"step": 548
},
{
"epoch": 0.8067597354886114,
"grad_norm": 1.7384294189757843,
"learning_rate": 9.437360853747223e-08,
"loss": 1.0759,
"step": 549
},
{
"epoch": 0.8082292432035268,
"grad_norm": 1.7812991412940928,
"learning_rate": 9.29845391326598e-08,
"loss": 0.9964,
"step": 550
},
{
"epoch": 0.8096987509184423,
"grad_norm": 1.8853925917414667,
"learning_rate": 9.16047196696505e-08,
"loss": 1.0252,
"step": 551
},
{
"epoch": 0.8111682586333578,
"grad_norm": 1.87531769900298,
"learning_rate": 9.023418150658863e-08,
"loss": 1.0244,
"step": 552
},
{
"epoch": 0.8126377663482733,
"grad_norm": 1.7505658777412774,
"learning_rate": 8.887295579068988e-08,
"loss": 0.9692,
"step": 553
},
{
"epoch": 0.8141072740631888,
"grad_norm": 1.7482379564431354,
"learning_rate": 8.752107345753262e-08,
"loss": 0.958,
"step": 554
},
{
"epoch": 0.8155767817781043,
"grad_norm": 1.7375873910532686,
"learning_rate": 8.617856523035466e-08,
"loss": 1.034,
"step": 555
},
{
"epoch": 0.8170462894930198,
"grad_norm": 1.7433829403227212,
"learning_rate": 8.484546161935596e-08,
"loss": 1.011,
"step": 556
},
{
"epoch": 0.8185157972079353,
"grad_norm": 1.8154504530341387,
"learning_rate": 8.352179292100403e-08,
"loss": 1.0048,
"step": 557
},
{
"epoch": 0.8199853049228508,
"grad_norm": 1.758428463882583,
"learning_rate": 8.220758921734649e-08,
"loss": 0.9964,
"step": 558
},
{
"epoch": 0.8214548126377663,
"grad_norm": 1.7379172896067956,
"learning_rate": 8.090288037532706e-08,
"loss": 1.0351,
"step": 559
},
{
"epoch": 0.8229243203526818,
"grad_norm": 1.8067076107445252,
"learning_rate": 7.960769604610618e-08,
"loss": 1.0556,
"step": 560
},
{
"epoch": 0.8243938280675973,
"grad_norm": 1.7093430682749875,
"learning_rate": 7.83220656643881e-08,
"loss": 0.9723,
"step": 561
},
{
"epoch": 0.8258633357825128,
"grad_norm": 1.7301749018028927,
"learning_rate": 7.704601844775155e-08,
"loss": 1.0169,
"step": 562
},
{
"epoch": 0.8273328434974284,
"grad_norm": 1.9789670450938064,
"learning_rate": 7.577958339598529e-08,
"loss": 0.9434,
"step": 563
},
{
"epoch": 0.8288023512123439,
"grad_norm": 1.8296698182174271,
"learning_rate": 7.452278929042982e-08,
"loss": 1.0645,
"step": 564
},
{
"epoch": 0.8302718589272594,
"grad_norm": 1.732152158537409,
"learning_rate": 7.327566469332303e-08,
"loss": 1.0619,
"step": 565
},
{
"epoch": 0.8317413666421749,
"grad_norm": 1.7544744051868846,
"learning_rate": 7.203823794715041e-08,
"loss": 1.0365,
"step": 566
},
{
"epoch": 0.8332108743570904,
"grad_norm": 1.6893639940770133,
"learning_rate": 7.08105371740021e-08,
"loss": 1.0231,
"step": 567
},
{
"epoch": 0.8346803820720059,
"grad_norm": 1.8424586490873391,
"learning_rate": 6.959259027493303e-08,
"loss": 1.0513,
"step": 568
},
{
"epoch": 0.8361498897869214,
"grad_norm": 1.773145972559333,
"learning_rate": 6.838442492932867e-08,
"loss": 1.0079,
"step": 569
},
{
"epoch": 0.8376193975018369,
"grad_norm": 1.8325252478515535,
"learning_rate": 6.718606859427673e-08,
"loss": 1.0529,
"step": 570
},
{
"epoch": 0.8390889052167524,
"grad_norm": 1.7519464554535977,
"learning_rate": 6.599754850394263e-08,
"loss": 1.0249,
"step": 571
},
{
"epoch": 0.8405584129316679,
"grad_norm": 1.7817640686446343,
"learning_rate": 6.481889166895033e-08,
"loss": 0.9944,
"step": 572
},
{
"epoch": 0.8420279206465834,
"grad_norm": 1.692594190403071,
"learning_rate": 6.365012487576926e-08,
"loss": 0.9993,
"step": 573
},
{
"epoch": 0.8434974283614989,
"grad_norm": 1.795806347454193,
"learning_rate": 6.249127468610504e-08,
"loss": 1.0714,
"step": 574
},
{
"epoch": 0.8449669360764144,
"grad_norm": 1.7511008874312695,
"learning_rate": 6.134236743629562e-08,
"loss": 1.0483,
"step": 575
},
{
"epoch": 0.8464364437913299,
"grad_norm": 1.8161885855117905,
"learning_rate": 6.020342923671334e-08,
"loss": 0.9827,
"step": 576
},
{
"epoch": 0.8479059515062454,
"grad_norm": 1.769523032001142,
"learning_rate": 5.907448597117126e-08,
"loss": 1.0706,
"step": 577
},
{
"epoch": 0.8493754592211609,
"grad_norm": 1.7280914783545198,
"learning_rate": 5.7955563296334664e-08,
"loss": 1.0984,
"step": 578
},
{
"epoch": 0.8508449669360764,
"grad_norm": 1.748415591220106,
"learning_rate": 5.6846686641138394e-08,
"loss": 1.0084,
"step": 579
},
{
"epoch": 0.852314474650992,
"grad_norm": 1.772067208429697,
"learning_rate": 5.5747881206208936e-08,
"loss": 1.0337,
"step": 580
},
{
"epoch": 0.8537839823659075,
"grad_norm": 1.794873769633449,
"learning_rate": 5.465917196329106e-08,
"loss": 1.0163,
"step": 581
},
{
"epoch": 0.855253490080823,
"grad_norm": 1.65717460494895,
"learning_rate": 5.3580583654681266e-08,
"loss": 0.9955,
"step": 582
},
{
"epoch": 0.8567229977957385,
"grad_norm": 1.8162911342334482,
"learning_rate": 5.251214079266475e-08,
"loss": 1.0089,
"step": 583
},
{
"epoch": 0.858192505510654,
"grad_norm": 1.7542998181212444,
"learning_rate": 5.1453867658958704e-08,
"loss": 1.0556,
"step": 584
},
{
"epoch": 0.8596620132255695,
"grad_norm": 1.8004202990199547,
"learning_rate": 5.0405788304160426e-08,
"loss": 1.0302,
"step": 585
},
{
"epoch": 0.861131520940485,
"grad_norm": 1.8579500585467426,
"learning_rate": 4.936792654720029e-08,
"loss": 0.9883,
"step": 586
},
{
"epoch": 0.8626010286554004,
"grad_norm": 1.8684587727448756,
"learning_rate": 4.8340305974801266e-08,
"loss": 1.0037,
"step": 587
},
{
"epoch": 0.864070536370316,
"grad_norm": 1.7812638910539453,
"learning_rate": 4.7322949940942325e-08,
"loss": 0.9856,
"step": 588
},
{
"epoch": 0.8655400440852314,
"grad_norm": 1.7535934340855097,
"learning_rate": 4.63158815663276e-08,
"loss": 0.9972,
"step": 589
},
{
"epoch": 0.8670095518001469,
"grad_norm": 1.6677865713866784,
"learning_rate": 4.53191237378614e-08,
"loss": 0.9824,
"step": 590
},
{
"epoch": 0.8684790595150624,
"grad_norm": 1.7665411769395538,
"learning_rate": 4.433269910812759e-08,
"loss": 1.0083,
"step": 591
},
{
"epoch": 0.8699485672299779,
"grad_norm": 1.7251596215832548,
"learning_rate": 4.335663009487511e-08,
"loss": 1.1064,
"step": 592
},
{
"epoch": 0.8714180749448934,
"grad_norm": 1.7761940662623423,
"learning_rate": 4.2390938880508595e-08,
"loss": 0.9743,
"step": 593
},
{
"epoch": 0.8728875826598089,
"grad_norm": 1.7665619723627777,
"learning_rate": 4.143564741158362e-08,
"loss": 1.0002,
"step": 594
},
{
"epoch": 0.8743570903747244,
"grad_norm": 1.7774593865163342,
"learning_rate": 4.0490777398308753e-08,
"loss": 1.1467,
"step": 595
},
{
"epoch": 0.8758265980896399,
"grad_norm": 1.8769247217431873,
"learning_rate": 3.955635031405169e-08,
"loss": 1.0388,
"step": 596
},
{
"epoch": 0.8772961058045555,
"grad_norm": 1.696854448182369,
"learning_rate": 3.86323873948512e-08,
"loss": 1.1079,
"step": 597
},
{
"epoch": 0.878765613519471,
"grad_norm": 1.740381033337976,
"learning_rate": 3.771890963893476e-08,
"loss": 1.0556,
"step": 598
},
{
"epoch": 0.8802351212343865,
"grad_norm": 1.8733585897024647,
"learning_rate": 3.681593780624137e-08,
"loss": 1.0254,
"step": 599
},
{
"epoch": 0.881704628949302,
"grad_norm": 1.7391199819992769,
"learning_rate": 3.5923492417949285e-08,
"loss": 1.0785,
"step": 600
},
{
"epoch": 0.8831741366642175,
"grad_norm": 1.768770527552518,
"learning_rate": 3.5041593756010234e-08,
"loss": 0.9474,
"step": 601
},
{
"epoch": 0.884643644379133,
"grad_norm": 1.7597254899905834,
"learning_rate": 3.417026186268829e-08,
"loss": 1.0168,
"step": 602
},
{
"epoch": 0.8861131520940485,
"grad_norm": 1.766391016102045,
"learning_rate": 3.3309516540104e-08,
"loss": 1.0638,
"step": 603
},
{
"epoch": 0.887582659808964,
"grad_norm": 1.7917528115242387,
"learning_rate": 3.2459377349784986e-08,
"loss": 1.0187,
"step": 604
},
{
"epoch": 0.8890521675238795,
"grad_norm": 1.8178797362114174,
"learning_rate": 3.1619863612221075e-08,
"loss": 1.1161,
"step": 605
},
{
"epoch": 0.890521675238795,
"grad_norm": 1.8510839028666215,
"learning_rate": 3.079099440642496e-08,
"loss": 1.1099,
"step": 606
},
{
"epoch": 0.8919911829537105,
"grad_norm": 1.8373581400255845,
"learning_rate": 2.997278856949914e-08,
"loss": 1.0275,
"step": 607
},
{
"epoch": 0.893460690668626,
"grad_norm": 1.7886561701096182,
"learning_rate": 2.916526469620756e-08,
"loss": 1.0033,
"step": 608
},
{
"epoch": 0.8949301983835415,
"grad_norm": 1.807687857402709,
"learning_rate": 2.836844113855269e-08,
"loss": 1.1667,
"step": 609
},
{
"epoch": 0.896399706098457,
"grad_norm": 1.9522297555824326,
"learning_rate": 2.758233600535914e-08,
"loss": 1.0008,
"step": 610
},
{
"epoch": 0.8978692138133725,
"grad_norm": 1.746748545987193,
"learning_rate": 2.6806967161861593e-08,
"loss": 0.9866,
"step": 611
},
{
"epoch": 0.899338721528288,
"grad_norm": 1.8722599434434155,
"learning_rate": 2.6042352229298902e-08,
"loss": 1.0111,
"step": 612
},
{
"epoch": 0.9008082292432035,
"grad_norm": 1.9844512513700931,
"learning_rate": 2.5288508584513814e-08,
"loss": 1.0208,
"step": 613
},
{
"epoch": 0.9022777369581191,
"grad_norm": 1.8438532955412186,
"learning_rate": 2.4545453359557765e-08,
"loss": 1.0134,
"step": 614
},
{
"epoch": 0.9037472446730346,
"grad_norm": 1.997698936619193,
"learning_rate": 2.3813203441301778e-08,
"loss": 1.0442,
"step": 615
},
{
"epoch": 0.9052167523879501,
"grad_norm": 1.8633689608393182,
"learning_rate": 2.3091775471052734e-08,
"loss": 1.0536,
"step": 616
},
{
"epoch": 0.9066862601028656,
"grad_norm": 1.803504912241452,
"learning_rate": 2.2381185844174644e-08,
"loss": 0.9618,
"step": 617
},
{
"epoch": 0.9081557678177811,
"grad_norm": 1.842132096723327,
"learning_rate": 2.168145070971683e-08,
"loss": 1.0246,
"step": 618
},
{
"epoch": 0.9096252755326966,
"grad_norm": 1.6938059931200713,
"learning_rate": 2.099258597004644e-08,
"loss": 0.9949,
"step": 619
},
{
"epoch": 0.9110947832476121,
"grad_norm": 1.8449249437590283,
"learning_rate": 2.031460728048695e-08,
"loss": 0.9308,
"step": 620
},
{
"epoch": 0.9125642909625276,
"grad_norm": 1.7680792178846148,
"learning_rate": 1.9647530048962747e-08,
"loss": 1.0323,
"step": 621
},
{
"epoch": 0.914033798677443,
"grad_norm": 1.7245217442805951,
"learning_rate": 1.8991369435648774e-08,
"loss": 1.066,
"step": 622
},
{
"epoch": 0.9155033063923586,
"grad_norm": 1.6824175711845812,
"learning_rate": 1.8346140352625883e-08,
"loss": 0.9527,
"step": 623
},
{
"epoch": 0.916972814107274,
"grad_norm": 1.8432023340759272,
"learning_rate": 1.771185746354209e-08,
"loss": 1.0189,
"step": 624
},
{
"epoch": 0.9184423218221895,
"grad_norm": 1.710169997477168,
"learning_rate": 1.7088535183279407e-08,
"loss": 1.0606,
"step": 625
},
{
"epoch": 0.919911829537105,
"grad_norm": 1.7851652504906605,
"learning_rate": 1.647618767762593e-08,
"loss": 1.0862,
"step": 626
},
{
"epoch": 0.9213813372520205,
"grad_norm": 1.7742564953179376,
"learning_rate": 1.5874828862954327e-08,
"loss": 0.9628,
"step": 627
},
{
"epoch": 0.922850844966936,
"grad_norm": 1.7396155476524742,
"learning_rate": 1.5284472405905247e-08,
"loss": 1.0092,
"step": 628
},
{
"epoch": 0.9243203526818515,
"grad_norm": 1.8451127978395603,
"learning_rate": 1.4705131723076692e-08,
"loss": 1.0866,
"step": 629
},
{
"epoch": 0.925789860396767,
"grad_norm": 1.7647190484524309,
"learning_rate": 1.4136819980719472e-08,
"loss": 0.9677,
"step": 630
},
{
"epoch": 0.9272593681116826,
"grad_norm": 1.7313371584788355,
"learning_rate": 1.3579550094437676e-08,
"loss": 1.0212,
"step": 631
},
{
"epoch": 0.9287288758265981,
"grad_norm": 1.8145993858567757,
"learning_rate": 1.3033334728895119e-08,
"loss": 1.1065,
"step": 632
},
{
"epoch": 0.9301983835415136,
"grad_norm": 1.717794777422694,
"learning_rate": 1.2498186297527802e-08,
"loss": 1.0607,
"step": 633
},
{
"epoch": 0.9316678912564291,
"grad_norm": 1.792322328153233,
"learning_rate": 1.1974116962261527e-08,
"loss": 1.0869,
"step": 634
},
{
"epoch": 0.9331373989713446,
"grad_norm": 1.693093472386974,
"learning_rate": 1.1461138633235611e-08,
"loss": 1.0148,
"step": 635
},
{
"epoch": 0.9346069066862601,
"grad_norm": 1.6734060797181096,
"learning_rate": 1.095926296853228e-08,
"loss": 1.1259,
"step": 636
},
{
"epoch": 0.9360764144011756,
"grad_norm": 1.7040871240195783,
"learning_rate": 1.0468501373911532e-08,
"loss": 1.0137,
"step": 637
},
{
"epoch": 0.9375459221160911,
"grad_norm": 1.7386557010935322,
"learning_rate": 9.988865002552138e-09,
"loss": 1.0393,
"step": 638
},
{
"epoch": 0.9390154298310066,
"grad_norm": 1.7617286641427552,
"learning_rate": 9.520364754798116e-09,
"loss": 1.0118,
"step": 639
},
{
"epoch": 0.9404849375459221,
"grad_norm": 1.6838936053556308,
"learning_rate": 9.06301127791087e-09,
"loss": 1.0124,
"step": 640
},
{
"epoch": 0.9419544452608376,
"grad_norm": 1.7685007619311106,
"learning_rate": 8.61681496582739e-09,
"loss": 1.0152,
"step": 641
},
{
"epoch": 0.9434239529757531,
"grad_norm": 1.7655039690066217,
"learning_rate": 8.181785958923938e-09,
"loss": 1.0482,
"step": 642
},
{
"epoch": 0.9448934606906686,
"grad_norm": 1.7291794104286682,
"learning_rate": 7.757934143785561e-09,
"loss": 1.0601,
"step": 643
},
{
"epoch": 0.9463629684055841,
"grad_norm": 1.7987300815999252,
"learning_rate": 7.345269152981614e-09,
"loss": 1.0143,
"step": 644
},
{
"epoch": 0.9478324761204996,
"grad_norm": 1.9004775415700146,
"learning_rate": 6.943800364846653e-09,
"loss": 1.0523,
"step": 645
},
{
"epoch": 0.9493019838354151,
"grad_norm": 1.8776605483787512,
"learning_rate": 6.5535369032672095e-09,
"loss": 1.0512,
"step": 646
},
{
"epoch": 0.9507714915503307,
"grad_norm": 1.838241026881048,
"learning_rate": 6.174487637474801e-09,
"loss": 1.0331,
"step": 647
},
{
"epoch": 0.9522409992652462,
"grad_norm": 1.8662114005878103,
"learning_rate": 5.806661181843919e-09,
"loss": 1.0158,
"step": 648
},
{
"epoch": 0.9537105069801617,
"grad_norm": 1.8290957478227903,
"learning_rate": 5.450065895696632e-09,
"loss": 1.0203,
"step": 649
},
{
"epoch": 0.9551800146950772,
"grad_norm": 1.816824005969282,
"learning_rate": 5.1047098831125124e-09,
"loss": 1.0538,
"step": 650
},
{
"epoch": 0.9566495224099927,
"grad_norm": 1.7373242719164779,
"learning_rate": 4.770600992744178e-09,
"loss": 1.0476,
"step": 651
},
{
"epoch": 0.9581190301249082,
"grad_norm": 1.7624643989224076,
"learning_rate": 4.4477468176393196e-09,
"loss": 0.9904,
"step": 652
},
{
"epoch": 0.9595885378398237,
"grad_norm": 1.6940800434963768,
"learning_rate": 4.136154695068006e-09,
"loss": 0.9917,
"step": 653
},
{
"epoch": 0.9610580455547392,
"grad_norm": 1.7767015691295511,
"learning_rate": 3.8358317063557635e-09,
"loss": 0.9986,
"step": 654
},
{
"epoch": 0.9625275532696547,
"grad_norm": 1.8376231539249999,
"learning_rate": 3.546784676722925e-09,
"loss": 1.0045,
"step": 655
},
{
"epoch": 0.9639970609845702,
"grad_norm": 1.7864023251483983,
"learning_rate": 3.2690201751292002e-09,
"loss": 1.0509,
"step": 656
},
{
"epoch": 0.9654665686994857,
"grad_norm": 1.7872367497837103,
"learning_rate": 3.002544514124683e-09,
"loss": 1.0038,
"step": 657
},
{
"epoch": 0.9669360764144012,
"grad_norm": 1.813415766447048,
"learning_rate": 2.747363749706244e-09,
"loss": 1.0775,
"step": 658
},
{
"epoch": 0.9684055841293167,
"grad_norm": 1.6985870578620685,
"learning_rate": 2.5034836811799744e-09,
"loss": 0.9979,
"step": 659
},
{
"epoch": 0.9698750918442322,
"grad_norm": 1.799775560049143,
"learning_rate": 2.2709098510292347e-09,
"loss": 1.0341,
"step": 660
},
{
"epoch": 0.9713445995591476,
"grad_norm": 1.79062920637246,
"learning_rate": 2.049647544788813e-09,
"loss": 0.9512,
"step": 661
},
{
"epoch": 0.9728141072740631,
"grad_norm": 1.7662794380324007,
"learning_rate": 1.8397017909249634e-09,
"loss": 1.0268,
"step": 662
},
{
"epoch": 0.9742836149889786,
"grad_norm": 1.8355770556221809,
"learning_rate": 1.6410773607206663e-09,
"loss": 1.0576,
"step": 663
},
{
"epoch": 0.9757531227038942,
"grad_norm": 1.8043186627448684,
"learning_rate": 1.4537787681677683e-09,
"loss": 1.0364,
"step": 664
},
{
"epoch": 0.9772226304188097,
"grad_norm": 1.7794287516609661,
"learning_rate": 1.2778102698638993e-09,
"loss": 1.0147,
"step": 665
},
{
"epoch": 0.9786921381337252,
"grad_norm": 1.7980949111509996,
"learning_rate": 1.1131758649160494e-09,
"loss": 1.0211,
"step": 666
},
{
"epoch": 0.9801616458486407,
"grad_norm": 1.7978516181015844,
"learning_rate": 9.598792948496414e-10,
"loss": 1.026,
"step": 667
},
{
"epoch": 0.9816311535635562,
"grad_norm": 1.749274502717889,
"learning_rate": 8.179240435232659e-10,
"loss": 1.0138,
"step": 668
},
{
"epoch": 0.9831006612784717,
"grad_norm": 1.6895077719396587,
"learning_rate": 6.873133370498551e-10,
"loss": 0.9675,
"step": 669
},
{
"epoch": 0.9845701689933872,
"grad_norm": 1.7863102632275802,
"learning_rate": 5.680501437230755e-10,
"loss": 1.039,
"step": 670
},
{
"epoch": 0.9860396767083027,
"grad_norm": 1.8150425834625068,
"learning_rate": 4.6013717395010365e-10,
"loss": 1.0414,
"step": 671
},
{
"epoch": 0.9875091844232182,
"grad_norm": 1.8149761019770996,
"learning_rate": 3.63576880189731e-10,
"loss": 1.0658,
"step": 672
},
{
"epoch": 0.9889786921381337,
"grad_norm": 1.773148981178707,
"learning_rate": 2.783714568970197e-10,
"loss": 1.0608,
"step": 673
},
{
"epoch": 0.9904481998530492,
"grad_norm": 1.8037996402075553,
"learning_rate": 2.045228404731203e-10,
"loss": 0.998,
"step": 674
},
{
"epoch": 0.9919177075679647,
"grad_norm": 1.7297660404171118,
"learning_rate": 1.4203270922125143e-10,
"loss": 0.9825,
"step": 675
},
{
"epoch": 0.9933872152828802,
"grad_norm": 1.9244611901290156,
"learning_rate": 9.090248330889671e-11,
"loss": 1.0265,
"step": 676
},
{
"epoch": 0.9948567229977957,
"grad_norm": 1.8339125626229225,
"learning_rate": 5.1133324735164183e-11,
"loss": 0.9576,
"step": 677
},
{
"epoch": 0.9963262307127112,
"grad_norm": 1.8612723910880795,
"learning_rate": 2.2726137304529546e-11,
"loss": 0.9926,
"step": 678
},
{
"epoch": 0.9977957384276267,
"grad_norm": 1.7458716115831705,
"learning_rate": 5.6815666063525505e-12,
"loss": 1.0218,
"step": 679
},
{
"epoch": 0.9992652461425422,
"grad_norm": 1.7932712816026757,
"learning_rate": 0.0,
"loss": 1.0722,
"step": 680
}
],
"logging_steps": 1,
"max_steps": 680,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 799593529671680.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}