{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 510, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00984009840098401, "grad_norm": 4.0806973528031865, "learning_rate": 0.0, "loss": 1.3287, "num_tokens": 423926.0, "step": 1 }, { "epoch": 0.01968019680196802, "grad_norm": 3.67747034614091, "learning_rate": 6.25e-07, "loss": 1.2135, "num_tokens": 900553.0, "step": 2 }, { "epoch": 0.02952029520295203, "grad_norm": 3.8171114533122408, "learning_rate": 1.25e-06, "loss": 1.2489, "num_tokens": 1354101.0, "step": 3 }, { "epoch": 0.03936039360393604, "grad_norm": 3.601160453205568, "learning_rate": 1.8750000000000003e-06, "loss": 1.2057, "num_tokens": 1831336.0, "step": 4 }, { "epoch": 0.04920049200492005, "grad_norm": 3.5064220048415957, "learning_rate": 2.5e-06, "loss": 1.2032, "num_tokens": 2299763.0, "step": 5 }, { "epoch": 0.05904059040590406, "grad_norm": 3.1555410715542345, "learning_rate": 3.125e-06, "loss": 1.2117, "num_tokens": 2739130.0, "step": 6 }, { "epoch": 0.06888068880688807, "grad_norm": 2.268607117088524, "learning_rate": 3.7500000000000005e-06, "loss": 1.0872, "num_tokens": 3186646.0, "step": 7 }, { "epoch": 0.07872078720787208, "grad_norm": 2.2099777833656136, "learning_rate": 4.3750000000000005e-06, "loss": 1.0835, "num_tokens": 3629799.0, "step": 8 }, { "epoch": 0.08856088560885608, "grad_norm": 1.8443437689000939, "learning_rate": 5e-06, "loss": 0.8645, "num_tokens": 4083162.0, "step": 9 }, { "epoch": 0.0984009840098401, "grad_norm": 1.9131458467328875, "learning_rate": 5.625e-06, "loss": 0.884, "num_tokens": 4514571.0, "step": 10 }, { "epoch": 0.10824108241082411, "grad_norm": 1.6995323396674717, "learning_rate": 6.25e-06, "loss": 0.7743, "num_tokens": 4990034.0, "step": 11 }, { "epoch": 0.11808118081180811, "grad_norm": 1.9772572417592447, "learning_rate": 6.875e-06, "loss": 0.53, "num_tokens": 5446252.0, "step": 12 }, { "epoch": 0.12792127921279212, "grad_norm": 2.235467875434991, "learning_rate": 7.500000000000001e-06, "loss": 0.4443, "num_tokens": 5880398.0, "step": 13 }, { "epoch": 0.13776137761377613, "grad_norm": 1.779888369557262, "learning_rate": 8.125000000000001e-06, "loss": 0.3535, "num_tokens": 6317527.0, "step": 14 }, { "epoch": 0.14760147601476015, "grad_norm": 1.4174646561758966, "learning_rate": 8.750000000000001e-06, "loss": 0.257, "num_tokens": 6768988.0, "step": 15 }, { "epoch": 0.15744157441574416, "grad_norm": 0.825210223865691, "learning_rate": 9.375000000000001e-06, "loss": 0.1351, "num_tokens": 7212525.0, "step": 16 }, { "epoch": 0.16728167281672818, "grad_norm": 0.5272789553691977, "learning_rate": 1e-05, "loss": 0.1092, "num_tokens": 7660418.0, "step": 17 }, { "epoch": 0.17712177121771217, "grad_norm": 0.4895013664227461, "learning_rate": 9.999909003036192e-06, "loss": 0.3213, "num_tokens": 8107247.0, "step": 18 }, { "epoch": 0.18696186961869618, "grad_norm": 0.2368744364092584, "learning_rate": 9.99963601582496e-06, "loss": 0.0892, "num_tokens": 8547828.0, "step": 19 }, { "epoch": 0.1968019680196802, "grad_norm": 0.17827168672763916, "learning_rate": 9.999181049406756e-06, "loss": 0.0814, "num_tokens": 8995945.0, "step": 20 }, { "epoch": 0.2066420664206642, "grad_norm": 0.1790643674004139, "learning_rate": 9.998544122181829e-06, "loss": 0.07, "num_tokens": 9428292.0, "step": 21 }, { "epoch": 0.21648216482164823, "grad_norm": 0.17378868386224117, "learning_rate": 9.997725259909487e-06, "loss": 0.0822, "num_tokens": 9872391.0, "step": 22 }, { "epoch": 0.22632226322263221, "grad_norm": 0.16172795763425857, "learning_rate": 9.996724495707056e-06, "loss": 0.0695, "num_tokens": 10321265.0, "step": 23 }, { "epoch": 0.23616236162361623, "grad_norm": 0.15374263186536802, "learning_rate": 9.995541870048537e-06, "loss": 0.0699, "num_tokens": 10756941.0, "step": 24 }, { "epoch": 0.24600246002460024, "grad_norm": 0.13904547469173764, "learning_rate": 9.994177430762971e-06, "loss": 0.0654, "num_tokens": 11211102.0, "step": 25 }, { "epoch": 0.25584255842558423, "grad_norm": 0.1313831102836863, "learning_rate": 9.992631233032507e-06, "loss": 0.0575, "num_tokens": 11655921.0, "step": 26 }, { "epoch": 0.2656826568265683, "grad_norm": 0.12358184868008147, "learning_rate": 9.990903339390164e-06, "loss": 0.0654, "num_tokens": 12123406.0, "step": 27 }, { "epoch": 0.27552275522755226, "grad_norm": 0.13102947055348427, "learning_rate": 9.988993819717312e-06, "loss": 0.0576, "num_tokens": 12571100.0, "step": 28 }, { "epoch": 0.2853628536285363, "grad_norm": 0.12370667408412227, "learning_rate": 9.986902751240836e-06, "loss": 0.0672, "num_tokens": 13004426.0, "step": 29 }, { "epoch": 0.2952029520295203, "grad_norm": 0.11805846922575773, "learning_rate": 9.984630218530014e-06, "loss": 0.0653, "num_tokens": 13453098.0, "step": 30 }, { "epoch": 0.3050430504305043, "grad_norm": 0.12596387207866033, "learning_rate": 9.982176313493108e-06, "loss": 0.0531, "num_tokens": 13910634.0, "step": 31 }, { "epoch": 0.3148831488314883, "grad_norm": 0.10677675254365898, "learning_rate": 9.979541135373628e-06, "loss": 0.0422, "num_tokens": 14359546.0, "step": 32 }, { "epoch": 0.3247232472324723, "grad_norm": 0.11973442758561412, "learning_rate": 9.976724790746333e-06, "loss": 0.0681, "num_tokens": 14810794.0, "step": 33 }, { "epoch": 0.33456334563345635, "grad_norm": 0.09765768004768276, "learning_rate": 9.973727393512921e-06, "loss": 0.0513, "num_tokens": 15282306.0, "step": 34 }, { "epoch": 0.34440344403444034, "grad_norm": 0.09901017562060792, "learning_rate": 9.970549064897407e-06, "loss": 0.0461, "num_tokens": 15719140.0, "step": 35 }, { "epoch": 0.35424354243542433, "grad_norm": 0.08726438112480077, "learning_rate": 9.967189933441243e-06, "loss": 0.0546, "num_tokens": 16160794.0, "step": 36 }, { "epoch": 0.3640836408364084, "grad_norm": 0.0862429330782484, "learning_rate": 9.9636501349981e-06, "loss": 0.0482, "num_tokens": 16594103.0, "step": 37 }, { "epoch": 0.37392373923739236, "grad_norm": 0.08703197270042312, "learning_rate": 9.95992981272838e-06, "loss": 0.0431, "num_tokens": 17050487.0, "step": 38 }, { "epoch": 0.3837638376383764, "grad_norm": 0.09107427503262026, "learning_rate": 9.956029117093432e-06, "loss": 0.0437, "num_tokens": 17495118.0, "step": 39 }, { "epoch": 0.3936039360393604, "grad_norm": 0.0937444613029256, "learning_rate": 9.951948205849457e-06, "loss": 0.0454, "num_tokens": 17970094.0, "step": 40 }, { "epoch": 0.4034440344403444, "grad_norm": 0.08311104466178655, "learning_rate": 9.947687244041143e-06, "loss": 0.035, "num_tokens": 18410339.0, "step": 41 }, { "epoch": 0.4132841328413284, "grad_norm": 0.08793054938548499, "learning_rate": 9.943246403994969e-06, "loss": 0.0398, "num_tokens": 18827350.0, "step": 42 }, { "epoch": 0.4231242312423124, "grad_norm": 0.0870730212694011, "learning_rate": 9.938625865312252e-06, "loss": 0.0399, "num_tokens": 19263988.0, "step": 43 }, { "epoch": 0.43296432964329645, "grad_norm": 0.09378264156348792, "learning_rate": 9.933825814861877e-06, "loss": 0.0417, "num_tokens": 19720928.0, "step": 44 }, { "epoch": 0.44280442804428044, "grad_norm": 0.09291535955010499, "learning_rate": 9.928846446772737e-06, "loss": 0.0503, "num_tokens": 20158462.0, "step": 45 }, { "epoch": 0.45264452644526443, "grad_norm": 0.09075012323845133, "learning_rate": 9.923687962425895e-06, "loss": 0.0383, "num_tokens": 20601362.0, "step": 46 }, { "epoch": 0.46248462484624847, "grad_norm": 0.08004209717985471, "learning_rate": 9.91835057044642e-06, "loss": 0.037, "num_tokens": 21072330.0, "step": 47 }, { "epoch": 0.47232472324723246, "grad_norm": 0.08493198697322972, "learning_rate": 9.912834486694963e-06, "loss": 0.0391, "num_tokens": 21525288.0, "step": 48 }, { "epoch": 0.4821648216482165, "grad_norm": 0.09568253792719325, "learning_rate": 9.907139934259025e-06, "loss": 0.0343, "num_tokens": 21953640.0, "step": 49 }, { "epoch": 0.4920049200492005, "grad_norm": 0.08303424847503821, "learning_rate": 9.90126714344393e-06, "loss": 0.0376, "num_tokens": 22435895.0, "step": 50 }, { "epoch": 0.5018450184501845, "grad_norm": 0.08402276200759969, "learning_rate": 9.895216351763515e-06, "loss": 0.0329, "num_tokens": 22883079.0, "step": 51 }, { "epoch": 0.5116851168511685, "grad_norm": 0.08739650107232434, "learning_rate": 9.888987803930523e-06, "loss": 0.0434, "num_tokens": 23334165.0, "step": 52 }, { "epoch": 0.5215252152521526, "grad_norm": 0.08000069499125344, "learning_rate": 9.882581751846707e-06, "loss": 0.0292, "num_tokens": 23797014.0, "step": 53 }, { "epoch": 0.5313653136531366, "grad_norm": 0.08649372481980258, "learning_rate": 9.87599845459264e-06, "loss": 0.0347, "num_tokens": 24288663.0, "step": 54 }, { "epoch": 0.5412054120541205, "grad_norm": 0.08523000136251592, "learning_rate": 9.869238178417235e-06, "loss": 0.0341, "num_tokens": 24710186.0, "step": 55 }, { "epoch": 0.5510455104551045, "grad_norm": 0.07465459840939084, "learning_rate": 9.862301196726988e-06, "loss": 0.0304, "num_tokens": 25160056.0, "step": 56 }, { "epoch": 0.5608856088560885, "grad_norm": 0.08503608654454574, "learning_rate": 9.855187790074906e-06, "loss": 0.0396, "num_tokens": 25598946.0, "step": 57 }, { "epoch": 0.5707257072570726, "grad_norm": 0.09884072621149297, "learning_rate": 9.847898246149173e-06, "loss": 0.0379, "num_tokens": 26066191.0, "step": 58 }, { "epoch": 0.5805658056580566, "grad_norm": 0.08138600445295054, "learning_rate": 9.840432859761504e-06, "loss": 0.0346, "num_tokens": 26518390.0, "step": 59 }, { "epoch": 0.5904059040590406, "grad_norm": 0.0914956119948303, "learning_rate": 9.832791932835232e-06, "loss": 0.0338, "num_tokens": 26971703.0, "step": 60 }, { "epoch": 0.6002460024600246, "grad_norm": 0.07667618947866506, "learning_rate": 9.824975774393089e-06, "loss": 0.0311, "num_tokens": 27413893.0, "step": 61 }, { "epoch": 0.6100861008610086, "grad_norm": 0.07239904062409902, "learning_rate": 9.816984700544714e-06, "loss": 0.0281, "num_tokens": 27868849.0, "step": 62 }, { "epoch": 0.6199261992619927, "grad_norm": 0.07354472191772266, "learning_rate": 9.808819034473869e-06, "loss": 0.0286, "num_tokens": 28286253.0, "step": 63 }, { "epoch": 0.6297662976629766, "grad_norm": 0.07644175488051343, "learning_rate": 9.800479106425356e-06, "loss": 0.0259, "num_tokens": 28699755.0, "step": 64 }, { "epoch": 0.6396063960639606, "grad_norm": 0.07705355988067097, "learning_rate": 9.791965253691687e-06, "loss": 0.0273, "num_tokens": 29142779.0, "step": 65 }, { "epoch": 0.6494464944649446, "grad_norm": 0.09001368693559736, "learning_rate": 9.783277820599408e-06, "loss": 0.0324, "num_tokens": 29564256.0, "step": 66 }, { "epoch": 0.6592865928659286, "grad_norm": 0.07998998288422189, "learning_rate": 9.774417158495208e-06, "loss": 0.0311, "num_tokens": 30035831.0, "step": 67 }, { "epoch": 0.6691266912669127, "grad_norm": 0.08094973090726079, "learning_rate": 9.765383625731683e-06, "loss": 0.0306, "num_tokens": 30471581.0, "step": 68 }, { "epoch": 0.6789667896678967, "grad_norm": 0.07575991095164336, "learning_rate": 9.756177587652857e-06, "loss": 0.0259, "num_tokens": 30914761.0, "step": 69 }, { "epoch": 0.6888068880688807, "grad_norm": 0.08255507372333148, "learning_rate": 9.746799416579403e-06, "loss": 0.0288, "num_tokens": 31351700.0, "step": 70 }, { "epoch": 0.6986469864698647, "grad_norm": 0.07383179182825, "learning_rate": 9.737249491793587e-06, "loss": 0.0322, "num_tokens": 31832339.0, "step": 71 }, { "epoch": 0.7084870848708487, "grad_norm": 0.0779807473775348, "learning_rate": 9.727528199523923e-06, "loss": 0.0384, "num_tokens": 32297687.0, "step": 72 }, { "epoch": 0.7183271832718328, "grad_norm": 0.9900117683369877, "learning_rate": 9.717635932929556e-06, "loss": 0.2712, "num_tokens": 32795919.0, "step": 73 }, { "epoch": 0.7281672816728167, "grad_norm": 0.0991868351288853, "learning_rate": 9.707573092084368e-06, "loss": 0.0419, "num_tokens": 33250913.0, "step": 74 }, { "epoch": 0.7380073800738007, "grad_norm": 0.08222641622667337, "learning_rate": 9.697340083960785e-06, "loss": 0.0289, "num_tokens": 33700970.0, "step": 75 }, { "epoch": 0.7478474784747847, "grad_norm": 0.0807953175546777, "learning_rate": 9.686937322413325e-06, "loss": 0.0286, "num_tokens": 34160528.0, "step": 76 }, { "epoch": 0.7576875768757687, "grad_norm": 0.07804479191069146, "learning_rate": 9.676365228161869e-06, "loss": 0.0288, "num_tokens": 34612256.0, "step": 77 }, { "epoch": 0.7675276752767528, "grad_norm": 0.0766842054326196, "learning_rate": 9.66562422877462e-06, "loss": 0.0232, "num_tokens": 35040279.0, "step": 78 }, { "epoch": 0.7773677736777368, "grad_norm": 0.0891517560198986, "learning_rate": 9.654714758650844e-06, "loss": 0.0297, "num_tokens": 35479921.0, "step": 79 }, { "epoch": 0.7872078720787208, "grad_norm": 0.07866162600023792, "learning_rate": 9.643637259003276e-06, "loss": 0.0246, "num_tokens": 35916545.0, "step": 80 }, { "epoch": 0.7970479704797048, "grad_norm": 0.09304350719642641, "learning_rate": 9.632392177840286e-06, "loss": 0.034, "num_tokens": 36366132.0, "step": 81 }, { "epoch": 0.8068880688806888, "grad_norm": 0.07700162361294739, "learning_rate": 9.620979969947759e-06, "loss": 0.0253, "num_tokens": 36826531.0, "step": 82 }, { "epoch": 0.8167281672816729, "grad_norm": 0.07988889172787667, "learning_rate": 9.609401096870707e-06, "loss": 0.0218, "num_tokens": 37250803.0, "step": 83 }, { "epoch": 0.8265682656826568, "grad_norm": 0.07662932628344148, "learning_rate": 9.597656026894591e-06, "loss": 0.0246, "num_tokens": 37707551.0, "step": 84 }, { "epoch": 0.8364083640836408, "grad_norm": 0.08311607561866212, "learning_rate": 9.585745235026391e-06, "loss": 0.0288, "num_tokens": 38157805.0, "step": 85 }, { "epoch": 0.8462484624846248, "grad_norm": 0.08297888399480316, "learning_rate": 9.5736692029754e-06, "loss": 0.04, "num_tokens": 38632172.0, "step": 86 }, { "epoch": 0.8560885608856088, "grad_norm": 0.07156880458651095, "learning_rate": 9.561428419133723e-06, "loss": 0.0244, "num_tokens": 39106655.0, "step": 87 }, { "epoch": 0.8659286592865929, "grad_norm": 0.07908760460903244, "learning_rate": 9.549023378556548e-06, "loss": 0.0283, "num_tokens": 39539107.0, "step": 88 }, { "epoch": 0.8757687576875769, "grad_norm": 0.07754860550749794, "learning_rate": 9.53645458294211e-06, "loss": 0.0267, "num_tokens": 39991573.0, "step": 89 }, { "epoch": 0.8856088560885609, "grad_norm": 0.07382048908783427, "learning_rate": 9.523722540611403e-06, "loss": 0.0302, "num_tokens": 40424055.0, "step": 90 }, { "epoch": 0.8954489544895449, "grad_norm": 0.07273833859487783, "learning_rate": 9.510827766487625e-06, "loss": 0.0256, "num_tokens": 40896577.0, "step": 91 }, { "epoch": 0.9052890528905289, "grad_norm": 0.07964369978108742, "learning_rate": 9.497770782075353e-06, "loss": 0.035, "num_tokens": 41366216.0, "step": 92 }, { "epoch": 0.915129151291513, "grad_norm": 0.07401188987280478, "learning_rate": 9.484552115439445e-06, "loss": 0.0245, "num_tokens": 41809244.0, "step": 93 }, { "epoch": 0.9249692496924969, "grad_norm": 0.07840443321039077, "learning_rate": 9.471172301183695e-06, "loss": 0.0289, "num_tokens": 42248913.0, "step": 94 }, { "epoch": 0.9348093480934809, "grad_norm": 0.08325366129705902, "learning_rate": 9.4576318804292e-06, "loss": 0.0231, "num_tokens": 42692031.0, "step": 95 }, { "epoch": 0.9446494464944649, "grad_norm": 0.07523171897596297, "learning_rate": 9.443931400792486e-06, "loss": 0.0231, "num_tokens": 43152827.0, "step": 96 }, { "epoch": 0.9544895448954489, "grad_norm": 0.08414955879961411, "learning_rate": 9.430071416363352e-06, "loss": 0.0361, "num_tokens": 43589003.0, "step": 97 }, { "epoch": 0.964329643296433, "grad_norm": 0.07780017154939398, "learning_rate": 9.416052487682465e-06, "loss": 0.0325, "num_tokens": 44039888.0, "step": 98 }, { "epoch": 0.974169741697417, "grad_norm": 0.07812625211660923, "learning_rate": 9.401875181718686e-06, "loss": 0.0269, "num_tokens": 44477755.0, "step": 99 }, { "epoch": 0.984009840098401, "grad_norm": 0.08986184340836745, "learning_rate": 9.387540071846155e-06, "loss": 0.028, "num_tokens": 44942597.0, "step": 100 }, { "epoch": 0.993849938499385, "grad_norm": 0.07602588052869161, "learning_rate": 9.373047737821078e-06, "loss": 0.0263, "num_tokens": 45381474.0, "step": 101 }, { "epoch": 1.0, "grad_norm": 0.07602588052869161, "learning_rate": 9.358398765758296e-06, "loss": 0.0233, "num_tokens": 45594452.0, "step": 102 }, { "epoch": 1.009840098400984, "grad_norm": 0.10983513675872132, "learning_rate": 9.34359374810758e-06, "loss": 0.0203, "num_tokens": 46030876.0, "step": 103 }, { "epoch": 1.019680196801968, "grad_norm": 0.06573014575776571, "learning_rate": 9.328633283629666e-06, "loss": 0.0227, "num_tokens": 46486828.0, "step": 104 }, { "epoch": 1.029520295202952, "grad_norm": 0.08034715377363097, "learning_rate": 9.31351797737204e-06, "loss": 0.0318, "num_tokens": 46952208.0, "step": 105 }, { "epoch": 1.039360393603936, "grad_norm": 0.07429105498387009, "learning_rate": 9.29824844064447e-06, "loss": 0.0303, "num_tokens": 47410999.0, "step": 106 }, { "epoch": 1.04920049200492, "grad_norm": 0.08198831738294611, "learning_rate": 9.282825290994282e-06, "loss": 0.0214, "num_tokens": 47859777.0, "step": 107 }, { "epoch": 1.0590405904059041, "grad_norm": 0.08882279229683457, "learning_rate": 9.267249152181379e-06, "loss": 0.0348, "num_tokens": 48313998.0, "step": 108 }, { "epoch": 1.068880688806888, "grad_norm": 0.07979621896563821, "learning_rate": 9.251520654153028e-06, "loss": 0.0217, "num_tokens": 48768989.0, "step": 109 }, { "epoch": 1.0787207872078721, "grad_norm": 0.07487008128556895, "learning_rate": 9.235640433018363e-06, "loss": 0.0312, "num_tokens": 49228144.0, "step": 110 }, { "epoch": 1.088560885608856, "grad_norm": 0.07679612898308323, "learning_rate": 9.219609131022684e-06, "loss": 0.0931, "num_tokens": 49660848.0, "step": 111 }, { "epoch": 1.09840098400984, "grad_norm": 0.2760363965156351, "learning_rate": 9.203427396521454e-06, "loss": 0.0199, "num_tokens": 50101951.0, "step": 112 }, { "epoch": 1.1082410824108242, "grad_norm": 0.0785172085157219, "learning_rate": 9.187095883954104e-06, "loss": 0.0249, "num_tokens": 50561884.0, "step": 113 }, { "epoch": 1.118081180811808, "grad_norm": 0.07511192184574486, "learning_rate": 9.170615253817547e-06, "loss": 0.0202, "num_tokens": 51023121.0, "step": 114 }, { "epoch": 1.1279212792127922, "grad_norm": 0.0788369940517159, "learning_rate": 9.153986172639474e-06, "loss": 0.0242, "num_tokens": 51475717.0, "step": 115 }, { "epoch": 1.137761377613776, "grad_norm": 0.08125072620630153, "learning_rate": 9.137209312951395e-06, "loss": 0.023, "num_tokens": 51936171.0, "step": 116 }, { "epoch": 1.1476014760147601, "grad_norm": 0.07976122735928637, "learning_rate": 9.12028535326144e-06, "loss": 0.0243, "num_tokens": 52386546.0, "step": 117 }, { "epoch": 1.1574415744157442, "grad_norm": 0.08346012008591713, "learning_rate": 9.103214978026922e-06, "loss": 0.0325, "num_tokens": 52836033.0, "step": 118 }, { "epoch": 1.1672816728167281, "grad_norm": 0.11906435467388006, "learning_rate": 9.085998877626644e-06, "loss": 0.0322, "num_tokens": 53268007.0, "step": 119 }, { "epoch": 1.1771217712177122, "grad_norm": 0.07249411043595039, "learning_rate": 9.068637748332993e-06, "loss": 0.022, "num_tokens": 53704420.0, "step": 120 }, { "epoch": 1.186961869618696, "grad_norm": 0.07720155000392087, "learning_rate": 9.051132292283772e-06, "loss": 0.0175, "num_tokens": 54139993.0, "step": 121 }, { "epoch": 1.1968019680196802, "grad_norm": 0.07940921390702492, "learning_rate": 9.033483217453801e-06, "loss": 0.028, "num_tokens": 54570992.0, "step": 122 }, { "epoch": 1.2066420664206643, "grad_norm": 0.07451157010744122, "learning_rate": 9.015691237626292e-06, "loss": 0.0372, "num_tokens": 55010129.0, "step": 123 }, { "epoch": 1.2164821648216482, "grad_norm": 0.0847625901422177, "learning_rate": 8.997757072363976e-06, "loss": 0.026, "num_tokens": 55484006.0, "step": 124 }, { "epoch": 1.2263222632226323, "grad_norm": 0.0734447382667372, "learning_rate": 8.979681446980002e-06, "loss": 0.0187, "num_tokens": 55916003.0, "step": 125 }, { "epoch": 1.2361623616236161, "grad_norm": 0.07720210069754488, "learning_rate": 8.961465092508607e-06, "loss": 0.0204, "num_tokens": 56367051.0, "step": 126 }, { "epoch": 1.2460024600246002, "grad_norm": 0.07005770488907263, "learning_rate": 8.943108745675542e-06, "loss": 0.0191, "num_tokens": 56809919.0, "step": 127 }, { "epoch": 1.2558425584255843, "grad_norm": 0.070753857700872, "learning_rate": 8.92461314886829e-06, "loss": 0.0166, "num_tokens": 57239307.0, "step": 128 }, { "epoch": 1.2656826568265682, "grad_norm": 0.07412229123559273, "learning_rate": 8.905979050106029e-06, "loss": 0.0182, "num_tokens": 57659139.0, "step": 129 }, { "epoch": 1.2755227552275523, "grad_norm": 0.07409420522842904, "learning_rate": 8.887207203009385e-06, "loss": 0.0182, "num_tokens": 58106309.0, "step": 130 }, { "epoch": 1.2853628536285364, "grad_norm": 0.07792555021983492, "learning_rate": 8.868298366769956e-06, "loss": 0.02, "num_tokens": 58533301.0, "step": 131 }, { "epoch": 1.2952029520295203, "grad_norm": 0.07755261181847745, "learning_rate": 8.849253306119601e-06, "loss": 0.018, "num_tokens": 58966567.0, "step": 132 }, { "epoch": 1.3050430504305042, "grad_norm": 0.07236047683074463, "learning_rate": 8.83007279129952e-06, "loss": 0.0231, "num_tokens": 59420421.0, "step": 133 }, { "epoch": 1.3148831488314883, "grad_norm": 0.07698814451544558, "learning_rate": 8.810757598029094e-06, "loss": 0.0171, "num_tokens": 59841646.0, "step": 134 }, { "epoch": 1.3247232472324724, "grad_norm": 0.07814813953683186, "learning_rate": 8.79130850747452e-06, "loss": 0.02, "num_tokens": 60262792.0, "step": 135 }, { "epoch": 1.3345633456334562, "grad_norm": 0.06873838482423762, "learning_rate": 8.771726306217217e-06, "loss": 0.0185, "num_tokens": 60732922.0, "step": 136 }, { "epoch": 1.3444034440344403, "grad_norm": 0.09687913725758614, "learning_rate": 8.752011786222011e-06, "loss": 0.0176, "num_tokens": 61205715.0, "step": 137 }, { "epoch": 1.3542435424354244, "grad_norm": 0.07245054518431464, "learning_rate": 8.732165744805107e-06, "loss": 0.0235, "num_tokens": 61643580.0, "step": 138 }, { "epoch": 1.3640836408364083, "grad_norm": 0.0751731684247356, "learning_rate": 8.712188984601845e-06, "loss": 0.0204, "num_tokens": 62086664.0, "step": 139 }, { "epoch": 1.3739237392373924, "grad_norm": 0.0765957432328115, "learning_rate": 8.692082313534233e-06, "loss": 0.0246, "num_tokens": 62556237.0, "step": 140 }, { "epoch": 1.3837638376383765, "grad_norm": 0.07942748233411417, "learning_rate": 8.671846544778284e-06, "loss": 0.0262, "num_tokens": 63014753.0, "step": 141 }, { "epoch": 1.3936039360393604, "grad_norm": 0.07220660807543852, "learning_rate": 8.651482496731116e-06, "loss": 0.0206, "num_tokens": 63462646.0, "step": 142 }, { "epoch": 1.4034440344403443, "grad_norm": 0.07123468502943975, "learning_rate": 8.630990992977854e-06, "loss": 0.0171, "num_tokens": 63953362.0, "step": 143 }, { "epoch": 1.4132841328413284, "grad_norm": 0.06845259344800432, "learning_rate": 8.61037286225834e-06, "loss": 0.0176, "num_tokens": 64407104.0, "step": 144 }, { "epoch": 1.4231242312423125, "grad_norm": 0.0712728854892911, "learning_rate": 8.589628938433587e-06, "loss": 0.0198, "num_tokens": 64843926.0, "step": 145 }, { "epoch": 1.4329643296432963, "grad_norm": 0.07477061506965738, "learning_rate": 8.56876006045208e-06, "loss": 0.0186, "num_tokens": 65296311.0, "step": 146 }, { "epoch": 1.4428044280442804, "grad_norm": 0.06777146676082023, "learning_rate": 8.547767072315835e-06, "loss": 0.019, "num_tokens": 65756092.0, "step": 147 }, { "epoch": 1.4526445264452645, "grad_norm": 0.07114975862038111, "learning_rate": 8.526650823046266e-06, "loss": 0.0178, "num_tokens": 66217555.0, "step": 148 }, { "epoch": 1.4624846248462484, "grad_norm": 0.07525940320393967, "learning_rate": 8.505412166649847e-06, "loss": 0.0179, "num_tokens": 66685889.0, "step": 149 }, { "epoch": 1.4723247232472325, "grad_norm": 0.06408400112895582, "learning_rate": 8.484051962083579e-06, "loss": 0.0218, "num_tokens": 67133723.0, "step": 150 }, { "epoch": 1.4821648216482166, "grad_norm": 0.28687790559136056, "learning_rate": 8.462571073220243e-06, "loss": 0.0384, "num_tokens": 67578744.0, "step": 151 }, { "epoch": 1.4920049200492005, "grad_norm": 0.0793310962273527, "learning_rate": 8.44097036881347e-06, "loss": 0.0229, "num_tokens": 68014169.0, "step": 152 }, { "epoch": 1.5018450184501844, "grad_norm": 0.06714316191575523, "learning_rate": 8.419250722462603e-06, "loss": 0.0171, "num_tokens": 68463185.0, "step": 153 }, { "epoch": 1.5116851168511685, "grad_norm": 0.07317824154830675, "learning_rate": 8.39741301257736e-06, "loss": 0.0168, "num_tokens": 68915214.0, "step": 154 }, { "epoch": 1.5215252152521526, "grad_norm": 0.07498605748005185, "learning_rate": 8.375458122342317e-06, "loss": 0.0218, "num_tokens": 69372682.0, "step": 155 }, { "epoch": 1.5313653136531364, "grad_norm": 0.07930974741179314, "learning_rate": 8.353386939681186e-06, "loss": 0.0197, "num_tokens": 69856887.0, "step": 156 }, { "epoch": 1.5412054120541205, "grad_norm": 0.06557036848250264, "learning_rate": 8.331200357220908e-06, "loss": 0.0169, "num_tokens": 70281087.0, "step": 157 }, { "epoch": 1.5510455104551046, "grad_norm": 0.06911145883255942, "learning_rate": 8.308899272255542e-06, "loss": 0.0258, "num_tokens": 70703636.0, "step": 158 }, { "epoch": 1.5608856088560885, "grad_norm": 0.07694897194945108, "learning_rate": 8.286484586709989e-06, "loss": 0.0195, "num_tokens": 71151769.0, "step": 159 }, { "epoch": 1.5707257072570726, "grad_norm": 0.07509068702973927, "learning_rate": 8.263957207103506e-06, "loss": 0.0163, "num_tokens": 71608512.0, "step": 160 }, { "epoch": 1.5805658056580567, "grad_norm": 0.06708625688834685, "learning_rate": 8.241318044513046e-06, "loss": 0.0182, "num_tokens": 72064241.0, "step": 161 }, { "epoch": 1.5904059040590406, "grad_norm": 0.07409751654155339, "learning_rate": 8.218568014536414e-06, "loss": 0.0214, "num_tokens": 72529560.0, "step": 162 }, { "epoch": 1.6002460024600245, "grad_norm": 0.08398773553611014, "learning_rate": 8.195708037255233e-06, "loss": 0.0219, "num_tokens": 72983545.0, "step": 163 }, { "epoch": 1.6100861008610086, "grad_norm": 0.0924163975250339, "learning_rate": 8.172739037197739e-06, "loss": 0.0199, "num_tokens": 73420020.0, "step": 164 }, { "epoch": 1.6199261992619927, "grad_norm": 0.07341361509627292, "learning_rate": 8.149661943301382e-06, "loss": 0.0193, "num_tokens": 73867089.0, "step": 165 }, { "epoch": 1.6297662976629765, "grad_norm": 0.09734659605562107, "learning_rate": 8.126477688875262e-06, "loss": 0.0393, "num_tokens": 74337960.0, "step": 166 }, { "epoch": 1.6396063960639606, "grad_norm": 0.0653922306331562, "learning_rate": 8.103187211562386e-06, "loss": 0.0169, "num_tokens": 74779558.0, "step": 167 }, { "epoch": 1.6494464944649447, "grad_norm": 0.07532255669592061, "learning_rate": 8.079791453301742e-06, "loss": 0.0181, "num_tokens": 75249832.0, "step": 168 }, { "epoch": 1.6592865928659286, "grad_norm": 0.06592599083206258, "learning_rate": 8.056291360290202e-06, "loss": 0.0189, "num_tokens": 75680939.0, "step": 169 }, { "epoch": 1.6691266912669127, "grad_norm": 0.07466148209100053, "learning_rate": 8.032687882944264e-06, "loss": 0.0197, "num_tokens": 76131450.0, "step": 170 }, { "epoch": 1.6789667896678968, "grad_norm": 0.06887573727890169, "learning_rate": 8.0089819758616e-06, "loss": 0.0192, "num_tokens": 76593750.0, "step": 171 }, { "epoch": 1.6888068880688807, "grad_norm": 0.06693500545746488, "learning_rate": 7.985174597782469e-06, "loss": 0.0184, "num_tokens": 77050866.0, "step": 172 }, { "epoch": 1.6986469864698646, "grad_norm": 0.0743814094089007, "learning_rate": 7.961266711550922e-06, "loss": 0.018, "num_tokens": 77464839.0, "step": 173 }, { "epoch": 1.7084870848708487, "grad_norm": 0.07491525806289706, "learning_rate": 7.937259284075872e-06, "loss": 0.0242, "num_tokens": 77916642.0, "step": 174 }, { "epoch": 1.7183271832718328, "grad_norm": 0.06409329354600994, "learning_rate": 7.913153286291995e-06, "loss": 0.0159, "num_tokens": 78389781.0, "step": 175 }, { "epoch": 1.7281672816728166, "grad_norm": 0.07234553574283793, "learning_rate": 7.888949693120443e-06, "loss": 0.0185, "num_tokens": 78840356.0, "step": 176 }, { "epoch": 1.7380073800738007, "grad_norm": 0.06775336526687877, "learning_rate": 7.864649483429442e-06, "loss": 0.0222, "num_tokens": 79280780.0, "step": 177 }, { "epoch": 1.7478474784747848, "grad_norm": 0.06875670670455086, "learning_rate": 7.840253639994676e-06, "loss": 0.2282, "num_tokens": 79776798.0, "step": 178 }, { "epoch": 1.7576875768757687, "grad_norm": 0.6287911227006427, "learning_rate": 7.815763149459563e-06, "loss": 0.0278, "num_tokens": 80233063.0, "step": 179 }, { "epoch": 1.7675276752767528, "grad_norm": 0.0846354199885941, "learning_rate": 7.791179002295334e-06, "loss": 0.0176, "num_tokens": 80677429.0, "step": 180 }, { "epoch": 1.777367773677737, "grad_norm": 0.07041805151413434, "learning_rate": 7.766502192760995e-06, "loss": 0.019, "num_tokens": 81122406.0, "step": 181 }, { "epoch": 1.7872078720787208, "grad_norm": 0.0643818791121022, "learning_rate": 7.741733718863096e-06, "loss": 0.021, "num_tokens": 81570725.0, "step": 182 }, { "epoch": 1.7970479704797047, "grad_norm": 0.09698204542146324, "learning_rate": 7.71687458231538e-06, "loss": 0.0164, "num_tokens": 82009994.0, "step": 183 }, { "epoch": 1.8068880688806888, "grad_norm": 0.06854732624004896, "learning_rate": 7.69192578849827e-06, "loss": 0.0166, "num_tokens": 82455315.0, "step": 184 }, { "epoch": 1.8167281672816729, "grad_norm": 0.08363256711677616, "learning_rate": 7.666888346418205e-06, "loss": 0.026, "num_tokens": 82901866.0, "step": 185 }, { "epoch": 1.8265682656826567, "grad_norm": 0.07243825921308177, "learning_rate": 7.641763268666832e-06, "loss": 0.019, "num_tokens": 83346981.0, "step": 186 }, { "epoch": 1.8364083640836408, "grad_norm": 0.06930096159612371, "learning_rate": 7.616551571380061e-06, "loss": 0.0214, "num_tokens": 83806206.0, "step": 187 }, { "epoch": 1.846248462484625, "grad_norm": 0.0681854243495615, "learning_rate": 7.5912542741969585e-06, "loss": 0.0163, "num_tokens": 84274704.0, "step": 188 }, { "epoch": 1.8560885608856088, "grad_norm": 0.07242016185622592, "learning_rate": 7.5658724002185215e-06, "loss": 0.0176, "num_tokens": 84711534.0, "step": 189 }, { "epoch": 1.865928659286593, "grad_norm": 0.06579105822967504, "learning_rate": 7.54040697596629e-06, "loss": 0.0283, "num_tokens": 85160170.0, "step": 190 }, { "epoch": 1.875768757687577, "grad_norm": 0.07108784887648237, "learning_rate": 7.514859031340835e-06, "loss": 0.0396, "num_tokens": 85613402.0, "step": 191 }, { "epoch": 1.8856088560885609, "grad_norm": 0.1388795021837048, "learning_rate": 7.489229599580111e-06, "loss": 0.0246, "num_tokens": 86052381.0, "step": 192 }, { "epoch": 1.8954489544895448, "grad_norm": 0.07205397955257539, "learning_rate": 7.463519717217663e-06, "loss": 0.0168, "num_tokens": 86528412.0, "step": 193 }, { "epoch": 1.9052890528905289, "grad_norm": 0.06405950191637912, "learning_rate": 7.437730424040702e-06, "loss": 0.0172, "num_tokens": 86961573.0, "step": 194 }, { "epoch": 1.915129151291513, "grad_norm": 0.06777861356344914, "learning_rate": 7.411862763048068e-06, "loss": 0.2543, "num_tokens": 87398765.0, "step": 195 }, { "epoch": 1.9249692496924968, "grad_norm": 0.6838887307240747, "learning_rate": 7.38591778040803e-06, "loss": 0.0173, "num_tokens": 87844520.0, "step": 196 }, { "epoch": 1.934809348093481, "grad_norm": 0.0776240582840387, "learning_rate": 7.359896525415986e-06, "loss": 0.017, "num_tokens": 88283922.0, "step": 197 }, { "epoch": 1.944649446494465, "grad_norm": 0.07000567204619491, "learning_rate": 7.333800050452024e-06, "loss": 0.017, "num_tokens": 88751425.0, "step": 198 }, { "epoch": 1.954489544895449, "grad_norm": 0.06845270762016023, "learning_rate": 7.307629410938364e-06, "loss": 0.0156, "num_tokens": 89205645.0, "step": 199 }, { "epoch": 1.964329643296433, "grad_norm": 0.06368141241550528, "learning_rate": 7.281385665296663e-06, "loss": 0.0241, "num_tokens": 89660304.0, "step": 200 }, { "epoch": 1.974169741697417, "grad_norm": 0.07288122003316398, "learning_rate": 7.255069874905221e-06, "loss": 0.0177, "num_tokens": 90107418.0, "step": 201 }, { "epoch": 1.984009840098401, "grad_norm": 0.06287805035179808, "learning_rate": 7.228683104056051e-06, "loss": 0.0162, "num_tokens": 90539742.0, "step": 202 }, { "epoch": 1.9938499384993849, "grad_norm": 0.0689550244527601, "learning_rate": 7.202226419911832e-06, "loss": 0.0158, "num_tokens": 90980046.0, "step": 203 }, { "epoch": 2.0, "grad_norm": 0.08227420282080453, "learning_rate": 7.175700892462757e-06, "loss": 0.0146, "num_tokens": 91185077.0, "step": 204 }, { "epoch": 2.009840098400984, "grad_norm": 0.07497487377417848, "learning_rate": 7.149107594483251e-06, "loss": 0.0179, "num_tokens": 91614882.0, "step": 205 }, { "epoch": 2.019680196801968, "grad_norm": 0.06696426999273204, "learning_rate": 7.122447601488592e-06, "loss": 0.0138, "num_tokens": 92080175.0, "step": 206 }, { "epoch": 2.029520295202952, "grad_norm": 0.6914326541667255, "learning_rate": 7.095721991691411e-06, "loss": 0.0906, "num_tokens": 92537610.0, "step": 207 }, { "epoch": 2.039360393603936, "grad_norm": 0.07698410304489973, "learning_rate": 7.0689318459580845e-06, "loss": 0.014, "num_tokens": 93006593.0, "step": 208 }, { "epoch": 2.0492004920049203, "grad_norm": 0.06605656882870746, "learning_rate": 7.042078247765019e-06, "loss": 0.0135, "num_tokens": 93476972.0, "step": 209 }, { "epoch": 2.059040590405904, "grad_norm": 0.20519577755096738, "learning_rate": 7.015162283154843e-06, "loss": 0.0448, "num_tokens": 93923596.0, "step": 210 }, { "epoch": 2.068880688806888, "grad_norm": 0.07138995831165522, "learning_rate": 6.988185040692469e-06, "loss": 0.0147, "num_tokens": 94363955.0, "step": 211 }, { "epoch": 2.078720787207872, "grad_norm": 0.07243575038647761, "learning_rate": 6.961147611421076e-06, "loss": 0.0133, "num_tokens": 94822454.0, "step": 212 }, { "epoch": 2.088560885608856, "grad_norm": 0.06470408361373992, "learning_rate": 6.934051088817988e-06, "loss": 0.0129, "num_tokens": 95281989.0, "step": 213 }, { "epoch": 2.09840098400984, "grad_norm": 0.06874194166286132, "learning_rate": 6.906896568750441e-06, "loss": 0.0133, "num_tokens": 95718050.0, "step": 214 }, { "epoch": 2.108241082410824, "grad_norm": 0.06876988398180649, "learning_rate": 6.87968514943127e-06, "loss": 0.0131, "num_tokens": 96162240.0, "step": 215 }, { "epoch": 2.1180811808118083, "grad_norm": 0.06407419572232166, "learning_rate": 6.852417931374494e-06, "loss": 0.0154, "num_tokens": 96606411.0, "step": 216 }, { "epoch": 2.127921279212792, "grad_norm": 0.07884002350292682, "learning_rate": 6.825096017350807e-06, "loss": 0.0145, "num_tokens": 97056671.0, "step": 217 }, { "epoch": 2.137761377613776, "grad_norm": 0.9981711556732346, "learning_rate": 6.797720512342967e-06, "loss": 0.2386, "num_tokens": 97514395.0, "step": 218 }, { "epoch": 2.14760147601476, "grad_norm": 0.08409368119046862, "learning_rate": 6.77029252350113e-06, "loss": 0.0131, "num_tokens": 97963358.0, "step": 219 }, { "epoch": 2.1574415744157442, "grad_norm": 0.07248619207767666, "learning_rate": 6.742813160098054e-06, "loss": 0.0209, "num_tokens": 98418943.0, "step": 220 }, { "epoch": 2.167281672816728, "grad_norm": 0.0644855598598356, "learning_rate": 6.715283533484242e-06, "loss": 0.013, "num_tokens": 98877761.0, "step": 221 }, { "epoch": 2.177121771217712, "grad_norm": 0.06622877203564365, "learning_rate": 6.6877047570430044e-06, "loss": 0.0142, "num_tokens": 99319699.0, "step": 222 }, { "epoch": 2.1869618696186963, "grad_norm": 0.07112191399232934, "learning_rate": 6.660077946145412e-06, "loss": 0.0141, "num_tokens": 99741161.0, "step": 223 }, { "epoch": 2.19680196801968, "grad_norm": 0.06421873614899783, "learning_rate": 6.632404218105205e-06, "loss": 0.0125, "num_tokens": 100166108.0, "step": 224 }, { "epoch": 2.206642066420664, "grad_norm": 0.10033810460897824, "learning_rate": 6.604684692133597e-06, "loss": 0.0252, "num_tokens": 100619482.0, "step": 225 }, { "epoch": 2.2164821648216484, "grad_norm": 0.06530150422818083, "learning_rate": 6.576920489294011e-06, "loss": 0.0127, "num_tokens": 101046330.0, "step": 226 }, { "epoch": 2.2263222632226323, "grad_norm": 0.07185919142366148, "learning_rate": 6.549112732456739e-06, "loss": 0.016, "num_tokens": 101495504.0, "step": 227 }, { "epoch": 2.236162361623616, "grad_norm": 0.06818636079419191, "learning_rate": 6.5212625462535365e-06, "loss": 0.0213, "num_tokens": 101968574.0, "step": 228 }, { "epoch": 2.2460024600246005, "grad_norm": 0.06992643360327205, "learning_rate": 6.493371057032129e-06, "loss": 0.0129, "num_tokens": 102400777.0, "step": 229 }, { "epoch": 2.2558425584255843, "grad_norm": 0.07704204356207317, "learning_rate": 6.465439392810664e-06, "loss": 0.015, "num_tokens": 102843075.0, "step": 230 }, { "epoch": 2.265682656826568, "grad_norm": 0.06544957942318019, "learning_rate": 6.4374686832320944e-06, "loss": 0.0123, "num_tokens": 103261288.0, "step": 231 }, { "epoch": 2.275522755227552, "grad_norm": 0.062334728387161344, "learning_rate": 6.409460059518482e-06, "loss": 0.0129, "num_tokens": 103717825.0, "step": 232 }, { "epoch": 2.2853628536285364, "grad_norm": 0.06332858275476305, "learning_rate": 6.381414654425261e-06, "loss": 0.0206, "num_tokens": 104165764.0, "step": 233 }, { "epoch": 2.2952029520295203, "grad_norm": 0.09448999187172064, "learning_rate": 6.353333602195414e-06, "loss": 0.0124, "num_tokens": 104606652.0, "step": 234 }, { "epoch": 2.305043050430504, "grad_norm": 0.0668177771084121, "learning_rate": 6.325218038513604e-06, "loss": 0.0126, "num_tokens": 105039670.0, "step": 235 }, { "epoch": 2.3148831488314885, "grad_norm": 0.06130143066732146, "learning_rate": 6.2970691004602425e-06, "loss": 0.0128, "num_tokens": 105498133.0, "step": 236 }, { "epoch": 2.3247232472324724, "grad_norm": 0.06353571099299025, "learning_rate": 6.26888792646551e-06, "loss": 0.0132, "num_tokens": 105955337.0, "step": 237 }, { "epoch": 2.3345633456334562, "grad_norm": 0.07072744488591255, "learning_rate": 6.240675656263303e-06, "loss": 0.0206, "num_tokens": 106398065.0, "step": 238 }, { "epoch": 2.34440344403444, "grad_norm": 0.0701393753248028, "learning_rate": 6.212433430845145e-06, "loss": 0.0207, "num_tokens": 106835350.0, "step": 239 }, { "epoch": 2.3542435424354244, "grad_norm": 0.06848300507150402, "learning_rate": 6.184162392414044e-06, "loss": 0.0145, "num_tokens": 107296896.0, "step": 240 }, { "epoch": 2.3640836408364083, "grad_norm": 0.07142535679112567, "learning_rate": 6.155863684338294e-06, "loss": 0.0142, "num_tokens": 107740689.0, "step": 241 }, { "epoch": 2.373923739237392, "grad_norm": 0.061123970206678674, "learning_rate": 6.127538451105232e-06, "loss": 0.0127, "num_tokens": 108196343.0, "step": 242 }, { "epoch": 2.3837638376383765, "grad_norm": 0.0699480717937599, "learning_rate": 6.099187838274959e-06, "loss": 0.0136, "num_tokens": 108646263.0, "step": 243 }, { "epoch": 2.3936039360393604, "grad_norm": 0.06593159068712609, "learning_rate": 6.070812992434003e-06, "loss": 0.0127, "num_tokens": 109089367.0, "step": 244 }, { "epoch": 2.4034440344403443, "grad_norm": 0.07004342519131408, "learning_rate": 6.042415061148954e-06, "loss": 0.015, "num_tokens": 109534992.0, "step": 245 }, { "epoch": 2.4132841328413286, "grad_norm": 0.08012326478595262, "learning_rate": 6.013995192920044e-06, "loss": 0.0181, "num_tokens": 110020906.0, "step": 246 }, { "epoch": 2.4231242312423125, "grad_norm": 0.06662874441753898, "learning_rate": 5.985554537134702e-06, "loss": 0.0125, "num_tokens": 110456126.0, "step": 247 }, { "epoch": 2.4329643296432963, "grad_norm": 0.06935993203158428, "learning_rate": 5.957094244021071e-06, "loss": 0.0124, "num_tokens": 110889342.0, "step": 248 }, { "epoch": 2.4428044280442807, "grad_norm": 0.06948156606308496, "learning_rate": 5.928615464601497e-06, "loss": 0.0143, "num_tokens": 111329838.0, "step": 249 }, { "epoch": 2.4526445264452645, "grad_norm": 0.08174081643789106, "learning_rate": 5.900119350645956e-06, "loss": 0.0158, "num_tokens": 111789840.0, "step": 250 }, { "epoch": 2.4624846248462484, "grad_norm": 0.07150394939916561, "learning_rate": 5.871607054625497e-06, "loss": 0.0189, "num_tokens": 112240250.0, "step": 251 }, { "epoch": 2.4723247232472323, "grad_norm": 0.0795332410978433, "learning_rate": 5.8430797296656125e-06, "loss": 0.0211, "num_tokens": 112673933.0, "step": 252 }, { "epoch": 2.4821648216482166, "grad_norm": 0.0689891322716241, "learning_rate": 5.814538529499622e-06, "loss": 0.0148, "num_tokens": 113134668.0, "step": 253 }, { "epoch": 2.4920049200492005, "grad_norm": 0.07982357004600245, "learning_rate": 5.785984608421993e-06, "loss": 0.016, "num_tokens": 113596645.0, "step": 254 }, { "epoch": 2.5018450184501844, "grad_norm": 0.07075114056461625, "learning_rate": 5.757419121241667e-06, "loss": 0.0184, "num_tokens": 114038969.0, "step": 255 }, { "epoch": 2.5116851168511687, "grad_norm": 0.07123055043084275, "learning_rate": 5.7288432232353615e-06, "loss": 0.0172, "num_tokens": 114476716.0, "step": 256 }, { "epoch": 2.5215252152521526, "grad_norm": 0.06238876313147613, "learning_rate": 5.7002580701008325e-06, "loss": 0.0134, "num_tokens": 114958206.0, "step": 257 }, { "epoch": 2.5313653136531364, "grad_norm": 0.06444872364781601, "learning_rate": 5.6716648179101445e-06, "loss": 0.0122, "num_tokens": 115415134.0, "step": 258 }, { "epoch": 2.5412054120541203, "grad_norm": 0.06263958198396931, "learning_rate": 5.64306462306291e-06, "loss": 0.0142, "num_tokens": 115898609.0, "step": 259 }, { "epoch": 2.5510455104551046, "grad_norm": 0.07604741062856764, "learning_rate": 5.614458642239534e-06, "loss": 0.0121, "num_tokens": 116348874.0, "step": 260 }, { "epoch": 2.5608856088560885, "grad_norm": 0.07535603766213132, "learning_rate": 5.585848032354411e-06, "loss": 0.0187, "num_tokens": 116822359.0, "step": 261 }, { "epoch": 2.570725707257073, "grad_norm": 0.07043618175110689, "learning_rate": 5.557233950509159e-06, "loss": 0.0332, "num_tokens": 117246814.0, "step": 262 }, { "epoch": 2.5805658056580567, "grad_norm": 0.20105726012087, "learning_rate": 5.528617553945807e-06, "loss": 0.0128, "num_tokens": 117708546.0, "step": 263 }, { "epoch": 2.5904059040590406, "grad_norm": 0.07781337694485964, "learning_rate": 5.500000000000001e-06, "loss": 0.0138, "num_tokens": 118145045.0, "step": 264 }, { "epoch": 2.6002460024600245, "grad_norm": 0.06125312067867148, "learning_rate": 5.4713824460541964e-06, "loss": 0.0122, "num_tokens": 118586755.0, "step": 265 }, { "epoch": 2.6100861008610083, "grad_norm": 0.06778243090705331, "learning_rate": 5.442766049490843e-06, "loss": 0.0242, "num_tokens": 119025525.0, "step": 266 }, { "epoch": 2.6199261992619927, "grad_norm": 0.06153415408443227, "learning_rate": 5.414151967645591e-06, "loss": 0.0125, "num_tokens": 119479378.0, "step": 267 }, { "epoch": 2.6297662976629765, "grad_norm": 0.06942223993965259, "learning_rate": 5.385541357760469e-06, "loss": 0.0243, "num_tokens": 119937308.0, "step": 268 }, { "epoch": 2.639606396063961, "grad_norm": 0.06214738625923835, "learning_rate": 5.35693537693709e-06, "loss": 0.0131, "num_tokens": 120395822.0, "step": 269 }, { "epoch": 2.6494464944649447, "grad_norm": 0.0669104820126879, "learning_rate": 5.3283351820898586e-06, "loss": 0.013, "num_tokens": 120868183.0, "step": 270 }, { "epoch": 2.6592865928659286, "grad_norm": 0.0647734711741491, "learning_rate": 5.299741929899171e-06, "loss": 0.0123, "num_tokens": 121321350.0, "step": 271 }, { "epoch": 2.6691266912669125, "grad_norm": 0.07530586770614496, "learning_rate": 5.27115677676464e-06, "loss": 0.0148, "num_tokens": 121767518.0, "step": 272 }, { "epoch": 2.678966789667897, "grad_norm": 0.060785296202459226, "learning_rate": 5.242580878758334e-06, "loss": 0.0113, "num_tokens": 122214099.0, "step": 273 }, { "epoch": 2.6888068880688807, "grad_norm": 0.06051260316507515, "learning_rate": 5.21401539157801e-06, "loss": 0.0118, "num_tokens": 122641818.0, "step": 274 }, { "epoch": 2.6986469864698646, "grad_norm": 0.06697146315594861, "learning_rate": 5.1854614705003796e-06, "loss": 0.0129, "num_tokens": 123073540.0, "step": 275 }, { "epoch": 2.708487084870849, "grad_norm": 0.06445807540084068, "learning_rate": 5.156920270334389e-06, "loss": 0.011, "num_tokens": 123534092.0, "step": 276 }, { "epoch": 2.7183271832718328, "grad_norm": 0.06525833653570826, "learning_rate": 5.1283929453745055e-06, "loss": 0.0134, "num_tokens": 123958744.0, "step": 277 }, { "epoch": 2.7281672816728166, "grad_norm": 0.05852868843858968, "learning_rate": 5.099880649354044e-06, "loss": 0.0111, "num_tokens": 124403705.0, "step": 278 }, { "epoch": 2.7380073800738005, "grad_norm": 0.07241672197036282, "learning_rate": 5.071384535398505e-06, "loss": 0.0127, "num_tokens": 124889693.0, "step": 279 }, { "epoch": 2.747847478474785, "grad_norm": 0.06559889162939202, "learning_rate": 5.04290575597893e-06, "loss": 0.0141, "num_tokens": 125351837.0, "step": 280 }, { "epoch": 2.7576875768757687, "grad_norm": 0.06353700618816872, "learning_rate": 5.0144454628653015e-06, "loss": 0.0119, "num_tokens": 125793504.0, "step": 281 }, { "epoch": 2.767527675276753, "grad_norm": 0.06683615854915595, "learning_rate": 4.986004807079959e-06, "loss": 0.0129, "num_tokens": 126251447.0, "step": 282 }, { "epoch": 2.777367773677737, "grad_norm": 0.07018116547650956, "learning_rate": 4.957584938851048e-06, "loss": 0.0302, "num_tokens": 126702532.0, "step": 283 }, { "epoch": 2.787207872078721, "grad_norm": 0.0779053076124061, "learning_rate": 4.929187007565996e-06, "loss": 0.0135, "num_tokens": 127176249.0, "step": 284 }, { "epoch": 2.7970479704797047, "grad_norm": 0.05851676194024864, "learning_rate": 4.9008121617250425e-06, "loss": 0.0119, "num_tokens": 127624082.0, "step": 285 }, { "epoch": 2.8068880688806885, "grad_norm": 0.06198574153406469, "learning_rate": 4.87246154889477e-06, "loss": 0.0122, "num_tokens": 128044479.0, "step": 286 }, { "epoch": 2.816728167281673, "grad_norm": 0.06543548304815791, "learning_rate": 4.8441363156617085e-06, "loss": 0.0116, "num_tokens": 128480925.0, "step": 287 }, { "epoch": 2.8265682656826567, "grad_norm": 0.06877942595907044, "learning_rate": 4.815837607585957e-06, "loss": 0.0129, "num_tokens": 128924136.0, "step": 288 }, { "epoch": 2.836408364083641, "grad_norm": 0.07130567586435052, "learning_rate": 4.787566569154855e-06, "loss": 0.014, "num_tokens": 129349471.0, "step": 289 }, { "epoch": 2.846248462484625, "grad_norm": 0.059470596149140166, "learning_rate": 4.759324343736698e-06, "loss": 0.0114, "num_tokens": 129789536.0, "step": 290 }, { "epoch": 2.856088560885609, "grad_norm": 0.06813104970883538, "learning_rate": 4.731112073534491e-06, "loss": 0.0134, "num_tokens": 130231279.0, "step": 291 }, { "epoch": 2.8659286592865927, "grad_norm": 0.44224478447995313, "learning_rate": 4.70293089953976e-06, "loss": 0.2098, "num_tokens": 130687907.0, "step": 292 }, { "epoch": 2.875768757687577, "grad_norm": 0.06521058331073082, "learning_rate": 4.674781961486399e-06, "loss": 0.0121, "num_tokens": 131139639.0, "step": 293 }, { "epoch": 2.885608856088561, "grad_norm": 0.0669166944603376, "learning_rate": 4.646666397804586e-06, "loss": 0.0115, "num_tokens": 131558019.0, "step": 294 }, { "epoch": 2.8954489544895448, "grad_norm": 0.05844491203018172, "learning_rate": 4.618585345574741e-06, "loss": 0.0116, "num_tokens": 132015109.0, "step": 295 }, { "epoch": 2.905289052890529, "grad_norm": 0.06568591959992806, "learning_rate": 4.5905399404815196e-06, "loss": 0.0139, "num_tokens": 132473250.0, "step": 296 }, { "epoch": 2.915129151291513, "grad_norm": 0.0740965654131703, "learning_rate": 4.562531316767908e-06, "loss": 0.0142, "num_tokens": 132924223.0, "step": 297 }, { "epoch": 2.924969249692497, "grad_norm": 0.06249511498433694, "learning_rate": 4.534560607189338e-06, "loss": 0.0116, "num_tokens": 133401598.0, "step": 298 }, { "epoch": 2.9348093480934807, "grad_norm": 0.06283720689157825, "learning_rate": 4.506628942967874e-06, "loss": 0.0111, "num_tokens": 133858797.0, "step": 299 }, { "epoch": 2.944649446494465, "grad_norm": 0.056149678938915315, "learning_rate": 4.478737453746464e-06, "loss": 0.0104, "num_tokens": 134308841.0, "step": 300 }, { "epoch": 2.954489544895449, "grad_norm": 0.09917979372278035, "learning_rate": 4.450887267543261e-06, "loss": 0.016, "num_tokens": 134737382.0, "step": 301 }, { "epoch": 2.9643296432964332, "grad_norm": 0.0640380581075781, "learning_rate": 4.423079510705992e-06, "loss": 0.0122, "num_tokens": 135161876.0, "step": 302 }, { "epoch": 2.974169741697417, "grad_norm": 0.06467022053195733, "learning_rate": 4.395315307866404e-06, "loss": 0.0165, "num_tokens": 135642143.0, "step": 303 }, { "epoch": 2.984009840098401, "grad_norm": 0.06355259176583768, "learning_rate": 4.3675957818947965e-06, "loss": 0.0122, "num_tokens": 136085866.0, "step": 304 }, { "epoch": 2.993849938499385, "grad_norm": 0.06019771768539855, "learning_rate": 4.33992205385459e-06, "loss": 0.0192, "num_tokens": 136558605.0, "step": 305 }, { "epoch": 3.0, "grad_norm": 0.06019771768539855, "learning_rate": 4.312295242956998e-06, "loss": 0.0196, "num_tokens": 136769120.0, "step": 306 }, { "epoch": 3.009840098400984, "grad_norm": 0.09281584655295379, "learning_rate": 4.284716466515759e-06, "loss": 0.015, "num_tokens": 137221619.0, "step": 307 }, { "epoch": 3.019680196801968, "grad_norm": 0.06328942540239005, "learning_rate": 4.257186839901948e-06, "loss": 0.0242, "num_tokens": 137668031.0, "step": 308 }, { "epoch": 3.029520295202952, "grad_norm": 0.1013363943890701, "learning_rate": 4.229707476498871e-06, "loss": 0.0098, "num_tokens": 138123076.0, "step": 309 }, { "epoch": 3.039360393603936, "grad_norm": 0.05368561892248246, "learning_rate": 4.2022794876570335e-06, "loss": 0.0092, "num_tokens": 138584469.0, "step": 310 }, { "epoch": 3.0492004920049203, "grad_norm": 0.055594058845563385, "learning_rate": 4.1749039826491956e-06, "loss": 0.0097, "num_tokens": 139040295.0, "step": 311 }, { "epoch": 3.059040590405904, "grad_norm": 0.061365365973882896, "learning_rate": 4.1475820686255055e-06, "loss": 0.0112, "num_tokens": 139467398.0, "step": 312 }, { "epoch": 3.068880688806888, "grad_norm": 0.07576400254078287, "learning_rate": 4.120314850568731e-06, "loss": 0.0112, "num_tokens": 139915959.0, "step": 313 }, { "epoch": 3.078720787207872, "grad_norm": 0.06609887375125617, "learning_rate": 4.093103431249563e-06, "loss": 0.0093, "num_tokens": 140383093.0, "step": 314 }, { "epoch": 3.088560885608856, "grad_norm": 0.06075307137850355, "learning_rate": 4.065948911182015e-06, "loss": 0.0099, "num_tokens": 140822538.0, "step": 315 }, { "epoch": 3.09840098400984, "grad_norm": 0.062126435434214644, "learning_rate": 4.038852388578925e-06, "loss": 0.0099, "num_tokens": 141294868.0, "step": 316 }, { "epoch": 3.108241082410824, "grad_norm": 0.06817279237293263, "learning_rate": 4.011814959307533e-06, "loss": 0.0168, "num_tokens": 141736810.0, "step": 317 }, { "epoch": 3.1180811808118083, "grad_norm": 0.06959931879800538, "learning_rate": 3.984837716845157e-06, "loss": 0.0098, "num_tokens": 142184603.0, "step": 318 }, { "epoch": 3.127921279212792, "grad_norm": 0.06353490194683095, "learning_rate": 3.957921752234982e-06, "loss": 0.0167, "num_tokens": 142622524.0, "step": 319 }, { "epoch": 3.137761377613776, "grad_norm": 0.06947695018548046, "learning_rate": 3.931068154041919e-06, "loss": 0.0097, "num_tokens": 143083493.0, "step": 320 }, { "epoch": 3.14760147601476, "grad_norm": 0.059043509631747315, "learning_rate": 3.904278008308589e-06, "loss": 0.0104, "num_tokens": 143533589.0, "step": 321 }, { "epoch": 3.1574415744157442, "grad_norm": 0.06496286878562947, "learning_rate": 3.877552398511409e-06, "loss": 0.0096, "num_tokens": 143992909.0, "step": 322 }, { "epoch": 3.167281672816728, "grad_norm": 0.05772743455179762, "learning_rate": 3.85089240551675e-06, "loss": 0.0116, "num_tokens": 144432232.0, "step": 323 }, { "epoch": 3.177121771217712, "grad_norm": 0.06067527859066343, "learning_rate": 3.8242991075372436e-06, "loss": 0.0095, "num_tokens": 144872986.0, "step": 324 }, { "epoch": 3.1869618696186963, "grad_norm": 0.06092970409513084, "learning_rate": 3.7977735800881687e-06, "loss": 0.0096, "num_tokens": 145338330.0, "step": 325 }, { "epoch": 3.19680196801968, "grad_norm": 0.0599181858398082, "learning_rate": 3.7713168959439515e-06, "loss": 0.0105, "num_tokens": 145779615.0, "step": 326 }, { "epoch": 3.206642066420664, "grad_norm": 0.06222020359396952, "learning_rate": 3.74493012509478e-06, "loss": 0.0111, "num_tokens": 146242526.0, "step": 327 }, { "epoch": 3.2164821648216484, "grad_norm": 0.06718711950500762, "learning_rate": 3.718614334703339e-06, "loss": 0.011, "num_tokens": 146706718.0, "step": 328 }, { "epoch": 3.2263222632226323, "grad_norm": 0.06582492337813067, "learning_rate": 3.692370589061639e-06, "loss": 0.0176, "num_tokens": 147136544.0, "step": 329 }, { "epoch": 3.236162361623616, "grad_norm": 0.11649214601244208, "learning_rate": 3.6661999495479772e-06, "loss": 0.0143, "num_tokens": 147581972.0, "step": 330 }, { "epoch": 3.2460024600246005, "grad_norm": 0.07623567009845389, "learning_rate": 3.640103474584016e-06, "loss": 0.0109, "num_tokens": 148046352.0, "step": 331 }, { "epoch": 3.2558425584255843, "grad_norm": 0.10991461635206767, "learning_rate": 3.614082219591972e-06, "loss": 0.0125, "num_tokens": 148512074.0, "step": 332 }, { "epoch": 3.265682656826568, "grad_norm": 0.0553413140200664, "learning_rate": 3.588137236951934e-06, "loss": 0.0112, "num_tokens": 148974289.0, "step": 333 }, { "epoch": 3.275522755227552, "grad_norm": 0.0647893470114748, "learning_rate": 3.5622695759592996e-06, "loss": 0.0093, "num_tokens": 149432821.0, "step": 334 }, { "epoch": 3.2853628536285364, "grad_norm": 0.05909492699311915, "learning_rate": 3.5364802827823397e-06, "loss": 0.0095, "num_tokens": 149885306.0, "step": 335 }, { "epoch": 3.2952029520295203, "grad_norm": 0.06091784855812255, "learning_rate": 3.5107704004198904e-06, "loss": 0.012, "num_tokens": 150329823.0, "step": 336 }, { "epoch": 3.305043050430504, "grad_norm": 0.0653285630713288, "learning_rate": 3.485140968659166e-06, "loss": 0.0092, "num_tokens": 150762193.0, "step": 337 }, { "epoch": 3.3148831488314885, "grad_norm": 0.059323407369363576, "learning_rate": 3.4595930240337115e-06, "loss": 0.0095, "num_tokens": 151238234.0, "step": 338 }, { "epoch": 3.3247232472324724, "grad_norm": 0.057695587877706345, "learning_rate": 3.4341275997814795e-06, "loss": 0.0088, "num_tokens": 151665989.0, "step": 339 }, { "epoch": 3.3345633456334562, "grad_norm": 0.05732212076245395, "learning_rate": 3.408745725803042e-06, "loss": 0.0098, "num_tokens": 152102577.0, "step": 340 }, { "epoch": 3.34440344403444, "grad_norm": 0.0595487114131121, "learning_rate": 3.383448428619941e-06, "loss": 0.0097, "num_tokens": 152531136.0, "step": 341 }, { "epoch": 3.3542435424354244, "grad_norm": 0.06136757636985544, "learning_rate": 3.3582367313331692e-06, "loss": 0.0096, "num_tokens": 152996747.0, "step": 342 }, { "epoch": 3.3640836408364083, "grad_norm": 0.05958558106516815, "learning_rate": 3.3331116535817974e-06, "loss": 0.0095, "num_tokens": 153435759.0, "step": 343 }, { "epoch": 3.373923739237392, "grad_norm": 0.05874672952511224, "learning_rate": 3.308074211501732e-06, "loss": 0.0095, "num_tokens": 153870082.0, "step": 344 }, { "epoch": 3.3837638376383765, "grad_norm": 0.06528615293660638, "learning_rate": 3.2831254176846205e-06, "loss": 0.0099, "num_tokens": 154317864.0, "step": 345 }, { "epoch": 3.3936039360393604, "grad_norm": 0.06940648713659847, "learning_rate": 3.258266281136905e-06, "loss": 0.0152, "num_tokens": 154756769.0, "step": 346 }, { "epoch": 3.4034440344403443, "grad_norm": 0.061886970984610115, "learning_rate": 3.233497807239008e-06, "loss": 0.0097, "num_tokens": 155205894.0, "step": 347 }, { "epoch": 3.4132841328413286, "grad_norm": 0.06025802060896222, "learning_rate": 3.2088209977046657e-06, "loss": 0.0104, "num_tokens": 155650235.0, "step": 348 }, { "epoch": 3.4231242312423125, "grad_norm": 0.079505096478584, "learning_rate": 3.1842368505404388e-06, "loss": 0.0093, "num_tokens": 156097609.0, "step": 349 }, { "epoch": 3.4329643296432963, "grad_norm": 0.05748478767611536, "learning_rate": 3.1597463600053258e-06, "loss": 0.0099, "num_tokens": 156538234.0, "step": 350 }, { "epoch": 3.4428044280442807, "grad_norm": 0.06024044990886995, "learning_rate": 3.135350516570559e-06, "loss": 0.0089, "num_tokens": 156983361.0, "step": 351 }, { "epoch": 3.4526445264452645, "grad_norm": 0.05743125939934545, "learning_rate": 3.111050306879556e-06, "loss": 0.0092, "num_tokens": 157418360.0, "step": 352 }, { "epoch": 3.4624846248462484, "grad_norm": 0.05732681074233502, "learning_rate": 3.0868467137080075e-06, "loss": 0.0095, "num_tokens": 157832487.0, "step": 353 }, { "epoch": 3.4723247232472323, "grad_norm": 0.05983498966527194, "learning_rate": 3.0627407159241273e-06, "loss": 0.0134, "num_tokens": 158300222.0, "step": 354 }, { "epoch": 3.4821648216482166, "grad_norm": 0.11937622237492422, "learning_rate": 3.0387332884490806e-06, "loss": 0.0101, "num_tokens": 158757341.0, "step": 355 }, { "epoch": 3.4920049200492005, "grad_norm": 0.06131645683210505, "learning_rate": 3.014825402217533e-06, "loss": 0.0162, "num_tokens": 159204107.0, "step": 356 }, { "epoch": 3.5018450184501844, "grad_norm": 0.07487264837463524, "learning_rate": 2.9910180241384014e-06, "loss": 0.0094, "num_tokens": 159649846.0, "step": 357 }, { "epoch": 3.5116851168511687, "grad_norm": 0.05978234516994627, "learning_rate": 2.9673121170557396e-06, "loss": 0.0093, "num_tokens": 160075112.0, "step": 358 }, { "epoch": 3.5215252152521526, "grad_norm": 0.05949296353246161, "learning_rate": 2.9437086397097996e-06, "loss": 0.0654, "num_tokens": 160529018.0, "step": 359 }, { "epoch": 3.5313653136531364, "grad_norm": 0.49227073434692264, "learning_rate": 2.92020854669826e-06, "loss": 0.0099, "num_tokens": 160983786.0, "step": 360 }, { "epoch": 3.5412054120541203, "grad_norm": 0.06622940201032448, "learning_rate": 2.896812788437615e-06, "loss": 0.01, "num_tokens": 161416791.0, "step": 361 }, { "epoch": 3.5510455104551046, "grad_norm": 0.06226773442108338, "learning_rate": 2.8735223111247402e-06, "loss": 0.0086, "num_tokens": 161878966.0, "step": 362 }, { "epoch": 3.5608856088560885, "grad_norm": 0.052597494365304226, "learning_rate": 2.850338056698621e-06, "loss": 0.0105, "num_tokens": 162327927.0, "step": 363 }, { "epoch": 3.570725707257073, "grad_norm": 0.07012270383857575, "learning_rate": 2.827260962802263e-06, "loss": 0.0147, "num_tokens": 162786081.0, "step": 364 }, { "epoch": 3.5805658056580567, "grad_norm": 0.055063275741049224, "learning_rate": 2.804291962744768e-06, "loss": 0.0094, "num_tokens": 163225787.0, "step": 365 }, { "epoch": 3.5904059040590406, "grad_norm": 0.05925152600247852, "learning_rate": 2.7814319854635875e-06, "loss": 0.0103, "num_tokens": 163657501.0, "step": 366 }, { "epoch": 3.6002460024600245, "grad_norm": 0.06722393012249571, "learning_rate": 2.758681955486955e-06, "loss": 0.0144, "num_tokens": 164141801.0, "step": 367 }, { "epoch": 3.6100861008610083, "grad_norm": 0.06258536118771861, "learning_rate": 2.736042792896495e-06, "loss": 0.0095, "num_tokens": 164583522.0, "step": 368 }, { "epoch": 3.6199261992619927, "grad_norm": 0.058745565835135405, "learning_rate": 2.7135154132900133e-06, "loss": 0.2081, "num_tokens": 165059903.0, "step": 369 }, { "epoch": 3.6297662976629765, "grad_norm": 0.45572028974101275, "learning_rate": 2.691100727744458e-06, "loss": 0.0093, "num_tokens": 165501531.0, "step": 370 }, { "epoch": 3.639606396063961, "grad_norm": 0.06413891247366597, "learning_rate": 2.668799642779093e-06, "loss": 0.0092, "num_tokens": 165943767.0, "step": 371 }, { "epoch": 3.6494464944649447, "grad_norm": 0.06190519506649364, "learning_rate": 2.6466130603188157e-06, "loss": 0.0093, "num_tokens": 166367316.0, "step": 372 }, { "epoch": 3.6592865928659286, "grad_norm": 0.05986849692187042, "learning_rate": 2.624541877657685e-06, "loss": 0.0089, "num_tokens": 166834863.0, "step": 373 }, { "epoch": 3.6691266912669125, "grad_norm": 0.05660848820396227, "learning_rate": 2.602586987422643e-06, "loss": 0.0091, "num_tokens": 167276182.0, "step": 374 }, { "epoch": 3.678966789667897, "grad_norm": 0.06018671949798086, "learning_rate": 2.580749277537399e-06, "loss": 0.0089, "num_tokens": 167713298.0, "step": 375 }, { "epoch": 3.6888068880688807, "grad_norm": 0.05866547399139435, "learning_rate": 2.5590296311865294e-06, "loss": 0.0089, "num_tokens": 168174626.0, "step": 376 }, { "epoch": 3.6986469864698646, "grad_norm": 0.05484667079392053, "learning_rate": 2.537428926779758e-06, "loss": 0.0084, "num_tokens": 168614618.0, "step": 377 }, { "epoch": 3.708487084870849, "grad_norm": 0.05383142631084436, "learning_rate": 2.515948037916423e-06, "loss": 0.0093, "num_tokens": 169020836.0, "step": 378 }, { "epoch": 3.7183271832718328, "grad_norm": 0.06002011881446048, "learning_rate": 2.494587833350153e-06, "loss": 0.0087, "num_tokens": 169486382.0, "step": 379 }, { "epoch": 3.7281672816728166, "grad_norm": 0.05744381429509693, "learning_rate": 2.473349176953736e-06, "loss": 0.0095, "num_tokens": 169927283.0, "step": 380 }, { "epoch": 3.7380073800738005, "grad_norm": 0.060208708524887654, "learning_rate": 2.4522329276841664e-06, "loss": 0.0199, "num_tokens": 170406677.0, "step": 381 }, { "epoch": 3.747847478474785, "grad_norm": 0.07568634621301189, "learning_rate": 2.431239939547921e-06, "loss": 0.0091, "num_tokens": 170848564.0, "step": 382 }, { "epoch": 3.7576875768757687, "grad_norm": 0.05568928633830086, "learning_rate": 2.4103710615664145e-06, "loss": 0.0087, "num_tokens": 171300323.0, "step": 383 }, { "epoch": 3.767527675276753, "grad_norm": 0.05797358351819658, "learning_rate": 2.389627137741662e-06, "loss": 0.009, "num_tokens": 171756442.0, "step": 384 }, { "epoch": 3.777367773677737, "grad_norm": 0.05921869424481757, "learning_rate": 2.369009007022146e-06, "loss": 0.0093, "num_tokens": 172205755.0, "step": 385 }, { "epoch": 3.787207872078721, "grad_norm": 0.05977666911207599, "learning_rate": 2.3485175032688865e-06, "loss": 0.0093, "num_tokens": 172657182.0, "step": 386 }, { "epoch": 3.7970479704797047, "grad_norm": 0.05932546877174744, "learning_rate": 2.328153455221717e-06, "loss": 0.0093, "num_tokens": 173095889.0, "step": 387 }, { "epoch": 3.8068880688806885, "grad_norm": 0.06042663443540837, "learning_rate": 2.3079176864657673e-06, "loss": 0.0126, "num_tokens": 173547333.0, "step": 388 }, { "epoch": 3.816728167281673, "grad_norm": 0.15319976378374434, "learning_rate": 2.2878110153981565e-06, "loss": 0.0127, "num_tokens": 174010201.0, "step": 389 }, { "epoch": 3.8265682656826567, "grad_norm": 0.06053526576608028, "learning_rate": 2.267834255194894e-06, "loss": 0.0091, "num_tokens": 174451094.0, "step": 390 }, { "epoch": 3.836408364083641, "grad_norm": 0.056105090205726134, "learning_rate": 2.2479882137779903e-06, "loss": 0.0086, "num_tokens": 174912611.0, "step": 391 }, { "epoch": 3.846248462484625, "grad_norm": 0.05451931809215225, "learning_rate": 2.228273693782784e-06, "loss": 0.181, "num_tokens": 175406884.0, "step": 392 }, { "epoch": 3.856088560885609, "grad_norm": 0.21723127868567568, "learning_rate": 2.208691492525481e-06, "loss": 0.0201, "num_tokens": 175906140.0, "step": 393 }, { "epoch": 3.8659286592865927, "grad_norm": 0.0674311563870576, "learning_rate": 2.189242401970908e-06, "loss": 0.0109, "num_tokens": 176321003.0, "step": 394 }, { "epoch": 3.875768757687577, "grad_norm": 0.06352093720920256, "learning_rate": 2.169927208700482e-06, "loss": 0.0105, "num_tokens": 176774697.0, "step": 395 }, { "epoch": 3.885608856088561, "grad_norm": 0.07829494568911512, "learning_rate": 2.1507466938804013e-06, "loss": 0.009, "num_tokens": 177239096.0, "step": 396 }, { "epoch": 3.8954489544895448, "grad_norm": 0.05295977087497385, "learning_rate": 2.131701633230045e-06, "loss": 0.0108, "num_tokens": 177705420.0, "step": 397 }, { "epoch": 3.905289052890529, "grad_norm": 0.06253022869487425, "learning_rate": 2.112792796990616e-06, "loss": 0.011, "num_tokens": 178159440.0, "step": 398 }, { "epoch": 3.915129151291513, "grad_norm": 0.06311743420915392, "learning_rate": 2.0940209498939732e-06, "loss": 0.0087, "num_tokens": 178600759.0, "step": 399 }, { "epoch": 3.924969249692497, "grad_norm": 0.05932091295852782, "learning_rate": 2.075386851131711e-06, "loss": 0.022, "num_tokens": 179053675.0, "step": 400 }, { "epoch": 3.9348093480934807, "grad_norm": 0.07035371257735856, "learning_rate": 2.056891254324459e-06, "loss": 0.0099, "num_tokens": 179505855.0, "step": 401 }, { "epoch": 3.944649446494465, "grad_norm": 0.06484414999103219, "learning_rate": 2.038534907491396e-06, "loss": 0.0088, "num_tokens": 179936013.0, "step": 402 }, { "epoch": 3.954489544895449, "grad_norm": 0.060396196023032395, "learning_rate": 2.0203185530199983e-06, "loss": 0.0088, "num_tokens": 180382727.0, "step": 403 }, { "epoch": 3.9643296432964332, "grad_norm": 0.05481446314958498, "learning_rate": 2.0022429276360256e-06, "loss": 0.0083, "num_tokens": 180829894.0, "step": 404 }, { "epoch": 3.974169741697417, "grad_norm": 0.05579653515238233, "learning_rate": 1.9843087623737097e-06, "loss": 0.0084, "num_tokens": 181285317.0, "step": 405 }, { "epoch": 3.984009840098401, "grad_norm": 0.08234263765839406, "learning_rate": 1.966516782546199e-06, "loss": 0.0147, "num_tokens": 181726465.0, "step": 406 }, { "epoch": 3.993849938499385, "grad_norm": 0.061989197697927254, "learning_rate": 1.94886770771623e-06, "loss": 0.021, "num_tokens": 182137592.0, "step": 407 }, { "epoch": 4.0, "grad_norm": 0.06561678793095134, "learning_rate": 1.931362251667008e-06, "loss": 0.01, "num_tokens": 182350141.0, "step": 408 }, { "epoch": 4.009840098400984, "grad_norm": 0.08249843123562729, "learning_rate": 1.9140011223733576e-06, "loss": 0.0077, "num_tokens": 182808332.0, "step": 409 }, { "epoch": 4.019680196801968, "grad_norm": 0.051015416231373206, "learning_rate": 1.8967850219730799e-06, "loss": 0.0081, "num_tokens": 183234541.0, "step": 410 }, { "epoch": 4.029520295202952, "grad_norm": 0.05624434271690936, "learning_rate": 1.8797146467385604e-06, "loss": 0.008, "num_tokens": 183657861.0, "step": 411 }, { "epoch": 4.039360393603936, "grad_norm": 0.055439304665308706, "learning_rate": 1.8627906870486063e-06, "loss": 0.0076, "num_tokens": 184114899.0, "step": 412 }, { "epoch": 4.04920049200492, "grad_norm": 0.053200752263272366, "learning_rate": 1.8460138273605265e-06, "loss": 0.0075, "num_tokens": 184540917.0, "step": 413 }, { "epoch": 4.059040590405904, "grad_norm": 0.05493004000264093, "learning_rate": 1.8293847461824538e-06, "loss": 0.0182, "num_tokens": 184993388.0, "step": 414 }, { "epoch": 4.068880688806888, "grad_norm": 0.08372361370144696, "learning_rate": 1.8129041160458966e-06, "loss": 0.0136, "num_tokens": 185443466.0, "step": 415 }, { "epoch": 4.078720787207872, "grad_norm": 0.05966232822391394, "learning_rate": 1.7965726034785466e-06, "loss": 0.0081, "num_tokens": 185884446.0, "step": 416 }, { "epoch": 4.088560885608856, "grad_norm": 0.05964634351715403, "learning_rate": 1.780390868977318e-06, "loss": 0.0109, "num_tokens": 186324584.0, "step": 417 }, { "epoch": 4.0984009840098405, "grad_norm": 0.077920426753446, "learning_rate": 1.7643595669816378e-06, "loss": 0.0125, "num_tokens": 186805249.0, "step": 418 }, { "epoch": 4.108241082410824, "grad_norm": 0.052268159950947905, "learning_rate": 1.7484793458469745e-06, "loss": 0.0075, "num_tokens": 187242236.0, "step": 419 }, { "epoch": 4.118081180811808, "grad_norm": 0.055826923432114865, "learning_rate": 1.7327508478186216e-06, "loss": 0.0072, "num_tokens": 187710452.0, "step": 420 }, { "epoch": 4.127921279212792, "grad_norm": 0.05075579193844761, "learning_rate": 1.7171747090057201e-06, "loss": 0.0073, "num_tokens": 188166192.0, "step": 421 }, { "epoch": 4.137761377613776, "grad_norm": 0.05526562542826341, "learning_rate": 1.7017515593555295e-06, "loss": 0.0075, "num_tokens": 188609825.0, "step": 422 }, { "epoch": 4.14760147601476, "grad_norm": 0.05657081985879626, "learning_rate": 1.6864820226279607e-06, "loss": 0.0074, "num_tokens": 189068636.0, "step": 423 }, { "epoch": 4.157441574415744, "grad_norm": 0.05470927216680097, "learning_rate": 1.6713667163703348e-06, "loss": 0.008, "num_tokens": 189501962.0, "step": 424 }, { "epoch": 4.167281672816729, "grad_norm": 0.06396161695980077, "learning_rate": 1.6564062518924202e-06, "loss": 0.0134, "num_tokens": 189933564.0, "step": 425 }, { "epoch": 4.177121771217712, "grad_norm": 0.06708045538135683, "learning_rate": 1.6416012342417056e-06, "loss": 0.013, "num_tokens": 190376512.0, "step": 426 }, { "epoch": 4.186961869618696, "grad_norm": 0.06902938106259363, "learning_rate": 1.6269522621789246e-06, "loss": 0.0185, "num_tokens": 190824286.0, "step": 427 }, { "epoch": 4.19680196801968, "grad_norm": 0.06764173670337073, "learning_rate": 1.6124599281538452e-06, "loss": 0.023, "num_tokens": 191293501.0, "step": 428 }, { "epoch": 4.206642066420664, "grad_norm": 0.055114986132894145, "learning_rate": 1.5981248182813136e-06, "loss": 0.0073, "num_tokens": 191738984.0, "step": 429 }, { "epoch": 4.216482164821648, "grad_norm": 0.051261855942856496, "learning_rate": 1.583947512317537e-06, "loss": 0.0075, "num_tokens": 192188472.0, "step": 430 }, { "epoch": 4.226322263222632, "grad_norm": 0.05658108779272827, "learning_rate": 1.5699285836366488e-06, "loss": 0.0077, "num_tokens": 192610188.0, "step": 431 }, { "epoch": 4.236162361623617, "grad_norm": 0.05505791232716452, "learning_rate": 1.5560685992075141e-06, "loss": 0.0074, "num_tokens": 193063967.0, "step": 432 }, { "epoch": 4.2460024600246005, "grad_norm": 0.054672853412556996, "learning_rate": 1.5423681195707997e-06, "loss": 0.0072, "num_tokens": 193546556.0, "step": 433 }, { "epoch": 4.255842558425584, "grad_norm": 0.05581920906854521, "learning_rate": 1.528827698816306e-06, "loss": 0.0077, "num_tokens": 193976678.0, "step": 434 }, { "epoch": 4.265682656826568, "grad_norm": 0.05717281375700824, "learning_rate": 1.515447884560556e-06, "loss": 0.0074, "num_tokens": 194451959.0, "step": 435 }, { "epoch": 4.275522755227552, "grad_norm": 0.05353729617839062, "learning_rate": 1.502229217924649e-06, "loss": 0.0075, "num_tokens": 194919769.0, "step": 436 }, { "epoch": 4.285362853628536, "grad_norm": 0.05747591456155956, "learning_rate": 1.489172233512376e-06, "loss": 0.0073, "num_tokens": 195385061.0, "step": 437 }, { "epoch": 4.29520295202952, "grad_norm": 0.0605171649231424, "learning_rate": 1.4762774593885986e-06, "loss": 0.008, "num_tokens": 195829302.0, "step": 438 }, { "epoch": 4.305043050430505, "grad_norm": 0.057478821326489744, "learning_rate": 1.4635454170578917e-06, "loss": 0.0082, "num_tokens": 196272086.0, "step": 439 }, { "epoch": 4.3148831488314885, "grad_norm": 0.05677684124604302, "learning_rate": 1.4509766214434535e-06, "loss": 0.0074, "num_tokens": 196705376.0, "step": 440 }, { "epoch": 4.324723247232472, "grad_norm": 0.05308761925199772, "learning_rate": 1.4385715808662787e-06, "loss": 0.0077, "num_tokens": 197144202.0, "step": 441 }, { "epoch": 4.334563345633456, "grad_norm": 0.0537126730210605, "learning_rate": 1.4263307970246027e-06, "loss": 0.0072, "num_tokens": 197583484.0, "step": 442 }, { "epoch": 4.34440344403444, "grad_norm": 0.0581676639275268, "learning_rate": 1.41425476497361e-06, "loss": 0.01, "num_tokens": 198038237.0, "step": 443 }, { "epoch": 4.354243542435424, "grad_norm": 0.05692004762795615, "learning_rate": 1.4023439731054112e-06, "loss": 0.0076, "num_tokens": 198478749.0, "step": 444 }, { "epoch": 4.364083640836409, "grad_norm": 0.05533945838827064, "learning_rate": 1.390598903129296e-06, "loss": 0.0075, "num_tokens": 198919863.0, "step": 445 }, { "epoch": 4.373923739237393, "grad_norm": 0.08520786010753936, "learning_rate": 1.3790200300522413e-06, "loss": 0.021, "num_tokens": 199382929.0, "step": 446 }, { "epoch": 4.3837638376383765, "grad_norm": 0.05139822843194828, "learning_rate": 1.3676078221597157e-06, "loss": 0.0075, "num_tokens": 199843263.0, "step": 447 }, { "epoch": 4.39360393603936, "grad_norm": 0.31624336801190883, "learning_rate": 1.3563627409967257e-06, "loss": 0.0554, "num_tokens": 200287378.0, "step": 448 }, { "epoch": 4.403444034440344, "grad_norm": 0.05264132661612972, "learning_rate": 1.3452852413491563e-06, "loss": 0.0065, "num_tokens": 200793892.0, "step": 449 }, { "epoch": 4.413284132841328, "grad_norm": 0.05900297532446137, "learning_rate": 1.3343757712253804e-06, "loss": 0.0093, "num_tokens": 201237298.0, "step": 450 }, { "epoch": 4.423124231242312, "grad_norm": 0.28018993976510653, "learning_rate": 1.3236347718381338e-06, "loss": 0.1846, "num_tokens": 201713169.0, "step": 451 }, { "epoch": 4.432964329643297, "grad_norm": 0.056807303328480156, "learning_rate": 1.3130626775866743e-06, "loss": 0.0075, "num_tokens": 202186125.0, "step": 452 }, { "epoch": 4.442804428044281, "grad_norm": 0.050176964423526066, "learning_rate": 1.3026599160392173e-06, "loss": 0.0069, "num_tokens": 202642625.0, "step": 453 }, { "epoch": 4.4526445264452645, "grad_norm": 0.09261733517736204, "learning_rate": 1.292426907915634e-06, "loss": 0.0116, "num_tokens": 203084429.0, "step": 454 }, { "epoch": 4.462484624846248, "grad_norm": 0.05582692526106092, "learning_rate": 1.2823640670704443e-06, "loss": 0.0074, "num_tokens": 203531332.0, "step": 455 }, { "epoch": 4.472324723247232, "grad_norm": 0.0544386302493359, "learning_rate": 1.2724718004760794e-06, "loss": 0.007, "num_tokens": 203977923.0, "step": 456 }, { "epoch": 4.482164821648216, "grad_norm": 0.0485924079874789, "learning_rate": 1.2627505082064144e-06, "loss": 0.0075, "num_tokens": 204447835.0, "step": 457 }, { "epoch": 4.492004920049201, "grad_norm": 0.05314912137402164, "learning_rate": 1.2532005834205976e-06, "loss": 0.0077, "num_tokens": 204889787.0, "step": 458 }, { "epoch": 4.501845018450185, "grad_norm": 0.059487474238797314, "learning_rate": 1.2438224123471442e-06, "loss": 0.0081, "num_tokens": 205319753.0, "step": 459 }, { "epoch": 4.511685116851169, "grad_norm": 0.053210906319883816, "learning_rate": 1.2346163742683185e-06, "loss": 0.0075, "num_tokens": 205765375.0, "step": 460 }, { "epoch": 4.521525215252153, "grad_norm": 0.052123105650746834, "learning_rate": 1.2255828415047932e-06, "loss": 0.0215, "num_tokens": 206223886.0, "step": 461 }, { "epoch": 4.531365313653136, "grad_norm": 0.058022484093730155, "learning_rate": 1.216722179400592e-06, "loss": 0.0079, "num_tokens": 206657932.0, "step": 462 }, { "epoch": 4.54120541205412, "grad_norm": 0.06046936274806646, "learning_rate": 1.208034746308315e-06, "loss": 0.0119, "num_tokens": 207113562.0, "step": 463 }, { "epoch": 4.551045510455104, "grad_norm": 0.05202796714609154, "learning_rate": 1.1995208935746437e-06, "loss": 0.0073, "num_tokens": 207577701.0, "step": 464 }, { "epoch": 4.560885608856088, "grad_norm": 0.053010352876086804, "learning_rate": 1.1911809655261333e-06, "loss": 0.0071, "num_tokens": 208021678.0, "step": 465 }, { "epoch": 4.570725707257073, "grad_norm": 0.05123561999347148, "learning_rate": 1.1830152994552866e-06, "loss": 0.0072, "num_tokens": 208467211.0, "step": 466 }, { "epoch": 4.580565805658057, "grad_norm": 0.06003956537674327, "learning_rate": 1.175024225606912e-06, "loss": 0.0075, "num_tokens": 208909702.0, "step": 467 }, { "epoch": 4.590405904059041, "grad_norm": 0.056238323571208206, "learning_rate": 1.1672080671647695e-06, "loss": 0.0076, "num_tokens": 209342529.0, "step": 468 }, { "epoch": 4.6002460024600245, "grad_norm": 0.0552115239217723, "learning_rate": 1.1595671402384966e-06, "loss": 0.0112, "num_tokens": 209805083.0, "step": 469 }, { "epoch": 4.610086100861008, "grad_norm": 0.07199003274369836, "learning_rate": 1.152101753850828e-06, "loss": 0.0077, "num_tokens": 210267459.0, "step": 470 }, { "epoch": 4.619926199261993, "grad_norm": 0.050805856130265635, "learning_rate": 1.1448122099250946e-06, "loss": 0.0076, "num_tokens": 210732019.0, "step": 471 }, { "epoch": 4.629766297662977, "grad_norm": 0.054524548819372444, "learning_rate": 1.1376988032730135e-06, "loss": 0.0082, "num_tokens": 211158328.0, "step": 472 }, { "epoch": 4.639606396063961, "grad_norm": 0.061253234718759104, "learning_rate": 1.130761821582766e-06, "loss": 0.0068, "num_tokens": 211615451.0, "step": 473 }, { "epoch": 4.649446494464945, "grad_norm": 0.054438039734835346, "learning_rate": 1.1240015454073622e-06, "loss": 0.0073, "num_tokens": 212052321.0, "step": 474 }, { "epoch": 4.659286592865929, "grad_norm": 0.05428969184748025, "learning_rate": 1.1174182481532943e-06, "loss": 0.0085, "num_tokens": 212511116.0, "step": 475 }, { "epoch": 4.6691266912669125, "grad_norm": 0.06405292334946296, "learning_rate": 1.1110121960694773e-06, "loss": 0.015, "num_tokens": 212943028.0, "step": 476 }, { "epoch": 4.678966789667896, "grad_norm": 0.05682101787290208, "learning_rate": 1.104783648236486e-06, "loss": 0.0175, "num_tokens": 213395996.0, "step": 477 }, { "epoch": 4.68880688806888, "grad_norm": 0.05953289126520201, "learning_rate": 1.0987328565560711e-06, "loss": 0.007, "num_tokens": 213829686.0, "step": 478 }, { "epoch": 4.698646986469865, "grad_norm": 0.05168917084897981, "learning_rate": 1.0928600657409751e-06, "loss": 0.007, "num_tokens": 214277322.0, "step": 479 }, { "epoch": 4.708487084870849, "grad_norm": 0.1608212490455815, "learning_rate": 1.0871655133050372e-06, "loss": 0.1771, "num_tokens": 214763908.0, "step": 480 }, { "epoch": 4.718327183271833, "grad_norm": 0.06395186276533307, "learning_rate": 1.081649429553581e-06, "loss": 0.0096, "num_tokens": 215194438.0, "step": 481 }, { "epoch": 4.728167281672817, "grad_norm": 0.05084061327021202, "learning_rate": 1.076312037574106e-06, "loss": 0.0075, "num_tokens": 215654297.0, "step": 482 }, { "epoch": 4.7380073800738005, "grad_norm": 0.05497766832460128, "learning_rate": 1.0711535532272632e-06, "loss": 0.0075, "num_tokens": 216086613.0, "step": 483 }, { "epoch": 4.747847478474784, "grad_norm": 0.055956248844709046, "learning_rate": 1.0661741851381256e-06, "loss": 0.0076, "num_tokens": 216536848.0, "step": 484 }, { "epoch": 4.757687576875769, "grad_norm": 0.05660105878886696, "learning_rate": 1.0613741346877498e-06, "loss": 0.0074, "num_tokens": 216997743.0, "step": 485 }, { "epoch": 4.767527675276753, "grad_norm": 0.06132538554048896, "learning_rate": 1.056753596005032e-06, "loss": 0.0072, "num_tokens": 217458807.0, "step": 486 }, { "epoch": 4.777367773677737, "grad_norm": 0.055422872986755904, "learning_rate": 1.0523127559588579e-06, "loss": 0.0077, "num_tokens": 217917380.0, "step": 487 }, { "epoch": 4.787207872078721, "grad_norm": 0.05522528856299787, "learning_rate": 1.0480517941505428e-06, "loss": 0.0073, "num_tokens": 218379436.0, "step": 488 }, { "epoch": 4.797047970479705, "grad_norm": 0.06958848204521575, "learning_rate": 1.0439708829065708e-06, "loss": 0.0077, "num_tokens": 218813896.0, "step": 489 }, { "epoch": 4.8068880688806885, "grad_norm": 0.06386162989494891, "learning_rate": 1.0400701872716227e-06, "loss": 0.0073, "num_tokens": 219264589.0, "step": 490 }, { "epoch": 4.816728167281672, "grad_norm": 0.05787632062480919, "learning_rate": 1.0363498650019023e-06, "loss": 0.0076, "num_tokens": 219704673.0, "step": 491 }, { "epoch": 4.826568265682657, "grad_norm": 0.06292874524087964, "learning_rate": 1.0328100665587573e-06, "loss": 0.0127, "num_tokens": 220132557.0, "step": 492 }, { "epoch": 4.836408364083641, "grad_norm": 0.05664304451978608, "learning_rate": 1.029450935102592e-06, "loss": 0.0074, "num_tokens": 220586640.0, "step": 493 }, { "epoch": 4.846248462484625, "grad_norm": 0.06939065112948692, "learning_rate": 1.0262726064870801e-06, "loss": 0.0085, "num_tokens": 221016982.0, "step": 494 }, { "epoch": 4.856088560885609, "grad_norm": 0.052898898903196025, "learning_rate": 1.0232752092536666e-06, "loss": 0.0074, "num_tokens": 221462667.0, "step": 495 }, { "epoch": 4.865928659286593, "grad_norm": 0.05776273104035291, "learning_rate": 1.0204588646263731e-06, "loss": 0.0071, "num_tokens": 221913595.0, "step": 496 }, { "epoch": 4.875768757687577, "grad_norm": 0.05461381324686126, "learning_rate": 1.0178236865068933e-06, "loss": 0.0074, "num_tokens": 222326904.0, "step": 497 }, { "epoch": 4.885608856088561, "grad_norm": 0.055498699411421645, "learning_rate": 1.0153697814699858e-06, "loss": 0.0072, "num_tokens": 222774557.0, "step": 498 }, { "epoch": 4.895448954489545, "grad_norm": 0.053031884611377415, "learning_rate": 1.0130972487591658e-06, "loss": 0.0068, "num_tokens": 223236906.0, "step": 499 }, { "epoch": 4.905289052890529, "grad_norm": 0.054483452664515604, "learning_rate": 1.0110061802826889e-06, "loss": 0.0074, "num_tokens": 223673165.0, "step": 500 }, { "epoch": 4.915129151291513, "grad_norm": 0.05035207860411025, "learning_rate": 1.009096660609837e-06, "loss": 0.0068, "num_tokens": 224139613.0, "step": 501 }, { "epoch": 4.924969249692497, "grad_norm": 0.05491435698714458, "learning_rate": 1.0073687669674949e-06, "loss": 0.0108, "num_tokens": 224623785.0, "step": 502 }, { "epoch": 4.934809348093481, "grad_norm": 0.05816593959272977, "learning_rate": 1.0058225692370299e-06, "loss": 0.0074, "num_tokens": 225060586.0, "step": 503 }, { "epoch": 4.944649446494465, "grad_norm": 0.05535566978108373, "learning_rate": 1.0044581299514638e-06, "loss": 0.0073, "num_tokens": 225504012.0, "step": 504 }, { "epoch": 4.9544895448954485, "grad_norm": 0.053032498372300854, "learning_rate": 1.003275504292944e-06, "loss": 0.0073, "num_tokens": 225944573.0, "step": 505 }, { "epoch": 4.964329643296433, "grad_norm": 0.051644596347508585, "learning_rate": 1.0022747400905126e-06, "loss": 0.0072, "num_tokens": 226383598.0, "step": 506 }, { "epoch": 4.974169741697417, "grad_norm": 0.05140289748972471, "learning_rate": 1.0014558778181714e-06, "loss": 0.0067, "num_tokens": 226851896.0, "step": 507 }, { "epoch": 4.984009840098401, "grad_norm": 0.05762796975847249, "learning_rate": 1.0008189505932444e-06, "loss": 0.0077, "num_tokens": 227269800.0, "step": 508 }, { "epoch": 4.993849938499385, "grad_norm": 0.05745101659790941, "learning_rate": 1.0003639841750404e-06, "loss": 0.007, "num_tokens": 227720300.0, "step": 509 }, { "epoch": 5.0, "grad_norm": 0.0720275822622563, "learning_rate": 1.0000909969638097e-06, "loss": 0.007, "num_tokens": 227931822.0, "step": 510 }, { "epoch": 5.0, "step": 510, "total_flos": 7.699310209956577e+17, "train_loss": 0.04971654405360859, "train_runtime": 7469.8311, "train_samples_per_second": 8.7, "train_steps_per_second": 0.068 } ], "logging_steps": 1, "max_steps": 510, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.699310209956577e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }