{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9872340425531916, "eval_steps": 500, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01276595744680851, "grad_norm": 6.433622804272975, "learning_rate": 4.1666666666666667e-07, "loss": 1.2027, "step": 1 }, { "epoch": 0.02553191489361702, "grad_norm": 6.406146610066624, "learning_rate": 8.333333333333333e-07, "loss": 1.1984, "step": 2 }, { "epoch": 0.03829787234042553, "grad_norm": 6.369133763179086, "learning_rate": 1.25e-06, "loss": 1.1839, "step": 3 }, { "epoch": 0.05106382978723404, "grad_norm": 6.239688951706421, "learning_rate": 1.6666666666666667e-06, "loss": 1.2008, "step": 4 }, { "epoch": 0.06382978723404255, "grad_norm": 5.725473695055957, "learning_rate": 2.0833333333333334e-06, "loss": 1.1594, "step": 5 }, { "epoch": 0.07659574468085106, "grad_norm": 4.830315030136056, "learning_rate": 2.5e-06, "loss": 1.1631, "step": 6 }, { "epoch": 0.08936170212765958, "grad_norm": 4.369137536951095, "learning_rate": 2.916666666666667e-06, "loss": 1.1432, "step": 7 }, { "epoch": 0.10212765957446808, "grad_norm": 2.72542957762778, "learning_rate": 3.3333333333333333e-06, "loss": 1.0909, "step": 8 }, { "epoch": 0.1148936170212766, "grad_norm": 2.4979340740744864, "learning_rate": 3.7500000000000005e-06, "loss": 1.0602, "step": 9 }, { "epoch": 0.1276595744680851, "grad_norm": 2.3755723671193927, "learning_rate": 4.166666666666667e-06, "loss": 1.065, "step": 10 }, { "epoch": 0.14042553191489363, "grad_norm": 4.2839335365862965, "learning_rate": 4.583333333333333e-06, "loss": 1.0832, "step": 11 }, { "epoch": 0.15319148936170213, "grad_norm": 4.116111912140754, "learning_rate": 5e-06, "loss": 1.0428, "step": 12 }, { "epoch": 0.16595744680851063, "grad_norm": 4.005685697811548, "learning_rate": 5.416666666666667e-06, "loss": 1.0284, "step": 13 }, { "epoch": 0.17872340425531916, "grad_norm": 3.223331293112425, "learning_rate": 5.833333333333334e-06, "loss": 1.0322, "step": 14 }, { "epoch": 0.19148936170212766, "grad_norm": 3.373013206868157, "learning_rate": 6.25e-06, "loss": 1.0332, "step": 15 }, { "epoch": 0.20425531914893616, "grad_norm": 2.720673997841046, "learning_rate": 6.666666666666667e-06, "loss": 0.941, "step": 16 }, { "epoch": 0.2170212765957447, "grad_norm": 2.3602360691078172, "learning_rate": 7.083333333333335e-06, "loss": 0.9753, "step": 17 }, { "epoch": 0.2297872340425532, "grad_norm": 2.123106209821333, "learning_rate": 7.500000000000001e-06, "loss": 0.9487, "step": 18 }, { "epoch": 0.2425531914893617, "grad_norm": 1.9375668256870149, "learning_rate": 7.916666666666667e-06, "loss": 0.9297, "step": 19 }, { "epoch": 0.2553191489361702, "grad_norm": 1.679662940193537, "learning_rate": 8.333333333333334e-06, "loss": 0.9346, "step": 20 }, { "epoch": 0.2680851063829787, "grad_norm": 1.5536294679138145, "learning_rate": 8.750000000000001e-06, "loss": 0.9435, "step": 21 }, { "epoch": 0.28085106382978725, "grad_norm": 1.5959235415078292, "learning_rate": 9.166666666666666e-06, "loss": 0.9159, "step": 22 }, { "epoch": 0.2936170212765957, "grad_norm": 1.3251855095591336, "learning_rate": 9.583333333333335e-06, "loss": 0.924, "step": 23 }, { "epoch": 0.30638297872340425, "grad_norm": 1.333931250234237, "learning_rate": 1e-05, "loss": 0.9087, "step": 24 }, { "epoch": 0.3191489361702128, "grad_norm": 1.0631172254893704, "learning_rate": 9.999440509051367e-06, "loss": 0.8619, "step": 25 }, { "epoch": 0.33191489361702126, "grad_norm": 1.1428090130087067, "learning_rate": 9.997762161417517e-06, "loss": 0.8663, "step": 26 }, { "epoch": 0.3446808510638298, "grad_norm": 0.9408863721323562, "learning_rate": 9.994965332706574e-06, "loss": 0.9178, "step": 27 }, { "epoch": 0.3574468085106383, "grad_norm": 0.8661346108852108, "learning_rate": 9.991050648838676e-06, "loss": 0.8742, "step": 28 }, { "epoch": 0.3702127659574468, "grad_norm": 1.0870551192596247, "learning_rate": 9.986018985905901e-06, "loss": 0.9193, "step": 29 }, { "epoch": 0.3829787234042553, "grad_norm": 0.8726063330094801, "learning_rate": 9.979871469976197e-06, "loss": 0.8831, "step": 30 }, { "epoch": 0.39574468085106385, "grad_norm": 0.7897053030050917, "learning_rate": 9.972609476841368e-06, "loss": 0.8744, "step": 31 }, { "epoch": 0.4085106382978723, "grad_norm": 0.8532203062862161, "learning_rate": 9.964234631709188e-06, "loss": 0.8444, "step": 32 }, { "epoch": 0.42127659574468085, "grad_norm": 0.9015264429234872, "learning_rate": 9.954748808839675e-06, "loss": 0.9029, "step": 33 }, { "epoch": 0.4340425531914894, "grad_norm": 0.9314825609693578, "learning_rate": 9.944154131125643e-06, "loss": 0.906, "step": 34 }, { "epoch": 0.44680851063829785, "grad_norm": 0.8857778600988849, "learning_rate": 9.932452969617607e-06, "loss": 0.8727, "step": 35 }, { "epoch": 0.4595744680851064, "grad_norm": 0.9102922225453883, "learning_rate": 9.91964794299315e-06, "loss": 0.8703, "step": 36 }, { "epoch": 0.4723404255319149, "grad_norm": 0.9373330088387606, "learning_rate": 9.905741916970863e-06, "loss": 0.8632, "step": 37 }, { "epoch": 0.4851063829787234, "grad_norm": 0.648113277413392, "learning_rate": 9.890738003669029e-06, "loss": 0.8709, "step": 38 }, { "epoch": 0.4978723404255319, "grad_norm": 0.8476328265586036, "learning_rate": 9.874639560909118e-06, "loss": 0.8641, "step": 39 }, { "epoch": 0.5106382978723404, "grad_norm": 0.9530645465852795, "learning_rate": 9.857450191464337e-06, "loss": 0.8873, "step": 40 }, { "epoch": 0.5234042553191489, "grad_norm": 0.8834502637259801, "learning_rate": 9.839173742253334e-06, "loss": 0.9088, "step": 41 }, { "epoch": 0.5361702127659574, "grad_norm": 0.6674564507015708, "learning_rate": 9.819814303479268e-06, "loss": 0.8836, "step": 42 }, { "epoch": 0.548936170212766, "grad_norm": 0.89233659474358, "learning_rate": 9.799376207714446e-06, "loss": 0.861, "step": 43 }, { "epoch": 0.5617021276595745, "grad_norm": 0.9657269278077453, "learning_rate": 9.777864028930705e-06, "loss": 0.8508, "step": 44 }, { "epoch": 0.574468085106383, "grad_norm": 0.8180964670581452, "learning_rate": 9.755282581475769e-06, "loss": 0.8545, "step": 45 }, { "epoch": 0.5872340425531914, "grad_norm": 0.6255777011672562, "learning_rate": 9.731636918995821e-06, "loss": 0.8117, "step": 46 }, { "epoch": 0.6, "grad_norm": 0.7667163975506532, "learning_rate": 9.706932333304518e-06, "loss": 0.8895, "step": 47 }, { "epoch": 0.6127659574468085, "grad_norm": 0.8020605434247621, "learning_rate": 9.681174353198687e-06, "loss": 0.854, "step": 48 }, { "epoch": 0.625531914893617, "grad_norm": 0.7153116420784209, "learning_rate": 9.654368743221022e-06, "loss": 0.8491, "step": 49 }, { "epoch": 0.6382978723404256, "grad_norm": 0.7632863572263763, "learning_rate": 9.626521502369984e-06, "loss": 0.8296, "step": 50 }, { "epoch": 0.6510638297872341, "grad_norm": 0.7156700482115346, "learning_rate": 9.597638862757255e-06, "loss": 0.8269, "step": 51 }, { "epoch": 0.6638297872340425, "grad_norm": 0.7419920177521214, "learning_rate": 9.567727288213005e-06, "loss": 0.8732, "step": 52 }, { "epoch": 0.676595744680851, "grad_norm": 0.7274084462685031, "learning_rate": 9.536793472839325e-06, "loss": 0.8571, "step": 53 }, { "epoch": 0.6893617021276596, "grad_norm": 0.8071133486525568, "learning_rate": 9.504844339512096e-06, "loss": 0.8618, "step": 54 }, { "epoch": 0.7021276595744681, "grad_norm": 0.7040439538018551, "learning_rate": 9.471887038331686e-06, "loss": 0.8062, "step": 55 }, { "epoch": 0.7148936170212766, "grad_norm": 0.6461644492867274, "learning_rate": 9.437928945022772e-06, "loss": 0.8151, "step": 56 }, { "epoch": 0.7276595744680852, "grad_norm": 0.7749362259461846, "learning_rate": 9.40297765928369e-06, "loss": 0.8244, "step": 57 }, { "epoch": 0.7404255319148936, "grad_norm": 0.6824545661899104, "learning_rate": 9.36704100308565e-06, "loss": 0.8576, "step": 58 }, { "epoch": 0.7531914893617021, "grad_norm": 0.7432671437330083, "learning_rate": 9.330127018922195e-06, "loss": 0.8034, "step": 59 }, { "epoch": 0.7659574468085106, "grad_norm": 0.7600556786309642, "learning_rate": 9.292243968009332e-06, "loss": 0.8458, "step": 60 }, { "epoch": 0.7787234042553192, "grad_norm": 0.6265580676056108, "learning_rate": 9.253400328436699e-06, "loss": 0.861, "step": 61 }, { "epoch": 0.7914893617021277, "grad_norm": 0.6225560200815765, "learning_rate": 9.213604793270196e-06, "loss": 0.8604, "step": 62 }, { "epoch": 0.8042553191489362, "grad_norm": 0.8810350842991577, "learning_rate": 9.172866268606514e-06, "loss": 0.8427, "step": 63 }, { "epoch": 0.8170212765957446, "grad_norm": 0.6513841847105947, "learning_rate": 9.131193871579975e-06, "loss": 0.8028, "step": 64 }, { "epoch": 0.8297872340425532, "grad_norm": 0.6197128493703077, "learning_rate": 9.088596928322158e-06, "loss": 0.8656, "step": 65 }, { "epoch": 0.8425531914893617, "grad_norm": 0.6815619030192367, "learning_rate": 9.045084971874738e-06, "loss": 0.865, "step": 66 }, { "epoch": 0.8553191489361702, "grad_norm": 0.6559308467623148, "learning_rate": 9.000667740056033e-06, "loss": 0.8462, "step": 67 }, { "epoch": 0.8680851063829788, "grad_norm": 0.6623712768744643, "learning_rate": 8.955355173281709e-06, "loss": 0.819, "step": 68 }, { "epoch": 0.8808510638297873, "grad_norm": 0.6180928952593647, "learning_rate": 8.90915741234015e-06, "loss": 0.7912, "step": 69 }, { "epoch": 0.8936170212765957, "grad_norm": 0.6273034711494855, "learning_rate": 8.862084796122998e-06, "loss": 0.8346, "step": 70 }, { "epoch": 0.9063829787234042, "grad_norm": 0.5954230015766475, "learning_rate": 8.814147859311333e-06, "loss": 0.8285, "step": 71 }, { "epoch": 0.9191489361702128, "grad_norm": 0.6074571433084066, "learning_rate": 8.765357330018056e-06, "loss": 0.846, "step": 72 }, { "epoch": 0.9319148936170213, "grad_norm": 0.6369989484635978, "learning_rate": 8.715724127386971e-06, "loss": 0.8523, "step": 73 }, { "epoch": 0.9446808510638298, "grad_norm": 0.7075912076720394, "learning_rate": 8.665259359149132e-06, "loss": 0.8437, "step": 74 }, { "epoch": 0.9574468085106383, "grad_norm": 0.6088301978445659, "learning_rate": 8.613974319136959e-06, "loss": 0.8301, "step": 75 }, { "epoch": 0.9702127659574468, "grad_norm": 0.6095867804621813, "learning_rate": 8.561880484756726e-06, "loss": 0.8453, "step": 76 }, { "epoch": 0.9829787234042553, "grad_norm": 0.5939945482764697, "learning_rate": 8.508989514419959e-06, "loss": 0.8019, "step": 77 }, { "epoch": 0.9957446808510638, "grad_norm": 0.6748168678734431, "learning_rate": 8.455313244934324e-06, "loss": 0.842, "step": 78 }, { "epoch": 1.0085106382978724, "grad_norm": 1.547759173938074, "learning_rate": 8.400863688854598e-06, "loss": 1.3859, "step": 79 }, { "epoch": 1.0212765957446808, "grad_norm": 0.5834496082903455, "learning_rate": 8.345653031794292e-06, "loss": 0.6686, "step": 80 }, { "epoch": 1.0340425531914894, "grad_norm": 0.8717786274161273, "learning_rate": 8.289693629698564e-06, "loss": 0.8405, "step": 81 }, { "epoch": 1.0468085106382978, "grad_norm": 0.6157547818545464, "learning_rate": 8.232998006078998e-06, "loss": 0.82, "step": 82 }, { "epoch": 1.0595744680851065, "grad_norm": 0.7130510637018735, "learning_rate": 8.175578849210894e-06, "loss": 0.7519, "step": 83 }, { "epoch": 1.0723404255319149, "grad_norm": 0.690303000029393, "learning_rate": 8.117449009293668e-06, "loss": 0.751, "step": 84 }, { "epoch": 1.0851063829787233, "grad_norm": 0.7079577745236357, "learning_rate": 8.058621495575032e-06, "loss": 0.7809, "step": 85 }, { "epoch": 1.097872340425532, "grad_norm": 0.6652157947641348, "learning_rate": 7.99910947343957e-06, "loss": 0.7239, "step": 86 }, { "epoch": 1.1106382978723404, "grad_norm": 0.6609980037072015, "learning_rate": 7.938926261462366e-06, "loss": 0.7185, "step": 87 }, { "epoch": 1.123404255319149, "grad_norm": 0.6572439615475121, "learning_rate": 7.87808532842837e-06, "loss": 0.7618, "step": 88 }, { "epoch": 1.1361702127659574, "grad_norm": 0.6524069711521925, "learning_rate": 7.81660029031811e-06, "loss": 0.7565, "step": 89 }, { "epoch": 1.148936170212766, "grad_norm": 0.7267875020327975, "learning_rate": 7.754484907260513e-06, "loss": 0.7626, "step": 90 }, { "epoch": 1.1617021276595745, "grad_norm": 0.6412064880318965, "learning_rate": 7.691753080453413e-06, "loss": 0.7504, "step": 91 }, { "epoch": 1.174468085106383, "grad_norm": 0.6837901280994855, "learning_rate": 7.628418849052523e-06, "loss": 0.7695, "step": 92 }, { "epoch": 1.1872340425531915, "grad_norm": 0.6871789853177429, "learning_rate": 7.564496387029532e-06, "loss": 0.8094, "step": 93 }, { "epoch": 1.2, "grad_norm": 0.6238936487998985, "learning_rate": 7.500000000000001e-06, "loss": 0.7346, "step": 94 }, { "epoch": 1.2127659574468086, "grad_norm": 0.6150321617049529, "learning_rate": 7.434944122021837e-06, "loss": 0.8103, "step": 95 }, { "epoch": 1.225531914893617, "grad_norm": 0.7953351451984426, "learning_rate": 7.369343312364994e-06, "loss": 0.7658, "step": 96 }, { "epoch": 1.2382978723404254, "grad_norm": 0.6687467735647058, "learning_rate": 7.303212252253163e-06, "loss": 0.7674, "step": 97 }, { "epoch": 1.251063829787234, "grad_norm": 0.6041472836507435, "learning_rate": 7.236565741578163e-06, "loss": 0.7526, "step": 98 }, { "epoch": 1.2638297872340425, "grad_norm": 0.637437469544626, "learning_rate": 7.169418695587791e-06, "loss": 0.7459, "step": 99 }, { "epoch": 1.2765957446808511, "grad_norm": 0.6756564518794657, "learning_rate": 7.101786141547829e-06, "loss": 0.7433, "step": 100 }, { "epoch": 1.2893617021276595, "grad_norm": 0.617925149676544, "learning_rate": 7.033683215379002e-06, "loss": 0.8091, "step": 101 }, { "epoch": 1.302127659574468, "grad_norm": 0.6331398323341669, "learning_rate": 6.965125158269619e-06, "loss": 0.7685, "step": 102 }, { "epoch": 1.3148936170212766, "grad_norm": 0.6588781402724724, "learning_rate": 6.896127313264643e-06, "loss": 0.7512, "step": 103 }, { "epoch": 1.327659574468085, "grad_norm": 0.5567840865056657, "learning_rate": 6.8267051218319766e-06, "loss": 0.7207, "step": 104 }, { "epoch": 1.3404255319148937, "grad_norm": 0.6450623261539175, "learning_rate": 6.7568741204067145e-06, "loss": 0.7743, "step": 105 }, { "epoch": 1.353191489361702, "grad_norm": 0.5959132949800964, "learning_rate": 6.686649936914151e-06, "loss": 0.7671, "step": 106 }, { "epoch": 1.3659574468085105, "grad_norm": 0.5845969623624405, "learning_rate": 6.616048287272301e-06, "loss": 0.6478, "step": 107 }, { "epoch": 1.3787234042553191, "grad_norm": 0.759276559832424, "learning_rate": 6.545084971874738e-06, "loss": 0.8928, "step": 108 }, { "epoch": 1.3914893617021278, "grad_norm": 0.656020228293016, "learning_rate": 6.473775872054522e-06, "loss": 0.8068, "step": 109 }, { "epoch": 1.4042553191489362, "grad_norm": 0.5760724781489475, "learning_rate": 6.402136946530014e-06, "loss": 0.6719, "step": 110 }, { "epoch": 1.4170212765957446, "grad_norm": 0.6909070411507484, "learning_rate": 6.330184227833376e-06, "loss": 0.7651, "step": 111 }, { "epoch": 1.4297872340425533, "grad_norm": 0.7063008528557656, "learning_rate": 6.257933818722544e-06, "loss": 0.8234, "step": 112 }, { "epoch": 1.4425531914893617, "grad_norm": 0.5299402625765229, "learning_rate": 6.185401888577488e-06, "loss": 0.7144, "step": 113 }, { "epoch": 1.4553191489361703, "grad_norm": 0.7426768197949125, "learning_rate": 6.112604669781572e-06, "loss": 0.8006, "step": 114 }, { "epoch": 1.4680851063829787, "grad_norm": 0.6609099273855246, "learning_rate": 6.039558454088796e-06, "loss": 0.69, "step": 115 }, { "epoch": 1.4808510638297872, "grad_norm": 0.7192549036207418, "learning_rate": 5.9662795889777666e-06, "loss": 0.8544, "step": 116 }, { "epoch": 1.4936170212765958, "grad_norm": 0.5884339069720301, "learning_rate": 5.892784473993184e-06, "loss": 0.7449, "step": 117 }, { "epoch": 1.5063829787234042, "grad_norm": 0.7350852245605305, "learning_rate": 5.819089557075689e-06, "loss": 0.7141, "step": 118 }, { "epoch": 1.5191489361702128, "grad_norm": 0.6979695732369594, "learning_rate": 5.745211330880872e-06, "loss": 0.8249, "step": 119 }, { "epoch": 1.5319148936170213, "grad_norm": 0.5235553438939273, "learning_rate": 5.671166329088278e-06, "loss": 0.7202, "step": 120 }, { "epoch": 1.5446808510638297, "grad_norm": 0.682351171653146, "learning_rate": 5.596971122701221e-06, "loss": 0.7705, "step": 121 }, { "epoch": 1.5574468085106383, "grad_norm": 0.5620126307246519, "learning_rate": 5.522642316338268e-06, "loss": 0.7026, "step": 122 }, { "epoch": 1.570212765957447, "grad_norm": 0.5839082634964239, "learning_rate": 5.448196544517168e-06, "loss": 0.7979, "step": 123 }, { "epoch": 1.5829787234042554, "grad_norm": 0.5350002506195382, "learning_rate": 5.373650467932122e-06, "loss": 0.7143, "step": 124 }, { "epoch": 1.5957446808510638, "grad_norm": 0.6946599115440883, "learning_rate": 5.299020769725172e-06, "loss": 0.7873, "step": 125 }, { "epoch": 1.6085106382978722, "grad_norm": 0.5748638691181778, "learning_rate": 5.224324151752575e-06, "loss": 0.7365, "step": 126 }, { "epoch": 1.6212765957446809, "grad_norm": 0.6152731198473932, "learning_rate": 5.1495773308469935e-06, "loss": 0.8644, "step": 127 }, { "epoch": 1.6340425531914895, "grad_norm": 0.5381156315117113, "learning_rate": 5.074797035076319e-06, "loss": 0.645, "step": 128 }, { "epoch": 1.646808510638298, "grad_norm": 0.6120012511222306, "learning_rate": 5e-06, "loss": 0.7568, "step": 129 }, { "epoch": 1.6595744680851063, "grad_norm": 0.6206112452727576, "learning_rate": 4.9252029649236835e-06, "loss": 0.7629, "step": 130 }, { "epoch": 1.6723404255319148, "grad_norm": 0.5874975559118444, "learning_rate": 4.850422669153009e-06, "loss": 0.7951, "step": 131 }, { "epoch": 1.6851063829787234, "grad_norm": 0.585196228318139, "learning_rate": 4.775675848247427e-06, "loss": 0.742, "step": 132 }, { "epoch": 1.697872340425532, "grad_norm": 0.6903440231119622, "learning_rate": 4.700979230274829e-06, "loss": 0.885, "step": 133 }, { "epoch": 1.7106382978723405, "grad_norm": 0.5667409120364741, "learning_rate": 4.626349532067879e-06, "loss": 0.6209, "step": 134 }, { "epoch": 1.7234042553191489, "grad_norm": 0.5497052698552617, "learning_rate": 4.551803455482833e-06, "loss": 0.7317, "step": 135 }, { "epoch": 1.7361702127659573, "grad_norm": 0.5684834730769993, "learning_rate": 4.477357683661734e-06, "loss": 0.7886, "step": 136 }, { "epoch": 1.748936170212766, "grad_norm": 0.6194533934703746, "learning_rate": 4.4030288772987795e-06, "loss": 0.6998, "step": 137 }, { "epoch": 1.7617021276595746, "grad_norm": 0.6080025275110034, "learning_rate": 4.3288336709117246e-06, "loss": 0.7685, "step": 138 }, { "epoch": 1.774468085106383, "grad_norm": 0.6667437872795133, "learning_rate": 4.254788669119127e-06, "loss": 0.8504, "step": 139 }, { "epoch": 1.7872340425531914, "grad_norm": 0.4781266342412309, "learning_rate": 4.180910442924312e-06, "loss": 0.6402, "step": 140 }, { "epoch": 1.8, "grad_norm": 0.6254395295795602, "learning_rate": 4.107215526006818e-06, "loss": 0.8257, "step": 141 }, { "epoch": 1.8127659574468085, "grad_norm": 0.6183136446673195, "learning_rate": 4.033720411022235e-06, "loss": 0.7743, "step": 142 }, { "epoch": 1.825531914893617, "grad_norm": 0.5212997216190336, "learning_rate": 3.960441545911205e-06, "loss": 0.6946, "step": 143 }, { "epoch": 1.8382978723404255, "grad_norm": 0.5098103809041011, "learning_rate": 3.887395330218429e-06, "loss": 0.7032, "step": 144 }, { "epoch": 1.851063829787234, "grad_norm": 0.5515247196630589, "learning_rate": 3.8145981114225135e-06, "loss": 0.7748, "step": 145 }, { "epoch": 1.8638297872340426, "grad_norm": 0.5394319928110296, "learning_rate": 3.7420661812774577e-06, "loss": 0.8035, "step": 146 }, { "epoch": 1.8765957446808512, "grad_norm": 0.5389311901179236, "learning_rate": 3.669815772166625e-06, "loss": 0.7535, "step": 147 }, { "epoch": 1.8893617021276596, "grad_norm": 0.5019557383141358, "learning_rate": 3.5978630534699873e-06, "loss": 0.7643, "step": 148 }, { "epoch": 1.902127659574468, "grad_norm": 0.5222037980369133, "learning_rate": 3.526224127945479e-06, "loss": 0.7609, "step": 149 }, { "epoch": 1.9148936170212765, "grad_norm": 0.504024959936413, "learning_rate": 3.4549150281252635e-06, "loss": 0.7115, "step": 150 }, { "epoch": 1.9276595744680851, "grad_norm": 0.5265230348715444, "learning_rate": 3.383951712727701e-06, "loss": 0.858, "step": 151 }, { "epoch": 1.9404255319148938, "grad_norm": 0.5287078624508609, "learning_rate": 3.3133500630858507e-06, "loss": 0.7184, "step": 152 }, { "epoch": 1.9531914893617022, "grad_norm": 0.5415420248763081, "learning_rate": 3.2431258795932863e-06, "loss": 0.7139, "step": 153 }, { "epoch": 1.9659574468085106, "grad_norm": 0.634512902103396, "learning_rate": 3.173294878168025e-06, "loss": 0.7809, "step": 154 }, { "epoch": 1.978723404255319, "grad_norm": 0.5487803141993715, "learning_rate": 3.1038726867353587e-06, "loss": 0.7562, "step": 155 }, { "epoch": 1.9914893617021276, "grad_norm": 0.6076957525111952, "learning_rate": 3.0348748417303826e-06, "loss": 0.76, "step": 156 }, { "epoch": 2.0042553191489363, "grad_norm": 1.2666145516000928, "learning_rate": 2.966316784621e-06, "loss": 1.1093, "step": 157 }, { "epoch": 2.0170212765957447, "grad_norm": 0.5783416608389275, "learning_rate": 2.8982138584521734e-06, "loss": 0.6804, "step": 158 }, { "epoch": 2.029787234042553, "grad_norm": 0.5339936797853089, "learning_rate": 2.83058130441221e-06, "loss": 0.7346, "step": 159 }, { "epoch": 2.0425531914893615, "grad_norm": 0.5121545171163273, "learning_rate": 2.7634342584218364e-06, "loss": 0.7322, "step": 160 }, { "epoch": 2.0553191489361704, "grad_norm": 0.5688805304516626, "learning_rate": 2.6967877477468394e-06, "loss": 0.6979, "step": 161 }, { "epoch": 2.068085106382979, "grad_norm": 0.5376619126851392, "learning_rate": 2.6306566876350072e-06, "loss": 0.7996, "step": 162 }, { "epoch": 2.0808510638297872, "grad_norm": 0.4976912103481773, "learning_rate": 2.5650558779781635e-06, "loss": 0.5683, "step": 163 }, { "epoch": 2.0936170212765957, "grad_norm": 0.49223792144287865, "learning_rate": 2.5000000000000015e-06, "loss": 0.6968, "step": 164 }, { "epoch": 2.106382978723404, "grad_norm": 0.5474656248665821, "learning_rate": 2.43550361297047e-06, "loss": 0.8232, "step": 165 }, { "epoch": 2.119148936170213, "grad_norm": 0.4800286329768494, "learning_rate": 2.371581150947476e-06, "loss": 0.6172, "step": 166 }, { "epoch": 2.1319148936170214, "grad_norm": 0.574376129911797, "learning_rate": 2.3082469195465893e-06, "loss": 0.7489, "step": 167 }, { "epoch": 2.1446808510638298, "grad_norm": 0.537439553766131, "learning_rate": 2.245515092739488e-06, "loss": 0.6874, "step": 168 }, { "epoch": 2.157446808510638, "grad_norm": 0.5254400699435705, "learning_rate": 2.1833997096818897e-06, "loss": 0.6266, "step": 169 }, { "epoch": 2.1702127659574466, "grad_norm": 0.5028076699072636, "learning_rate": 2.1219146715716332e-06, "loss": 0.6489, "step": 170 }, { "epoch": 2.1829787234042555, "grad_norm": 0.586944387365733, "learning_rate": 2.061073738537635e-06, "loss": 0.763, "step": 171 }, { "epoch": 2.195744680851064, "grad_norm": 0.505894086651978, "learning_rate": 2.0008905265604316e-06, "loss": 0.6465, "step": 172 }, { "epoch": 2.2085106382978723, "grad_norm": 0.53984278729039, "learning_rate": 1.941378504424968e-06, "loss": 0.6834, "step": 173 }, { "epoch": 2.2212765957446807, "grad_norm": 0.5072572058751608, "learning_rate": 1.8825509907063328e-06, "loss": 0.7395, "step": 174 }, { "epoch": 2.2340425531914896, "grad_norm": 0.515782601163649, "learning_rate": 1.8244211507891064e-06, "loss": 0.7249, "step": 175 }, { "epoch": 2.246808510638298, "grad_norm": 0.496774063368525, "learning_rate": 1.7670019939210025e-06, "loss": 0.7169, "step": 176 }, { "epoch": 2.2595744680851064, "grad_norm": 0.5168551876746815, "learning_rate": 1.7103063703014372e-06, "loss": 0.7966, "step": 177 }, { "epoch": 2.272340425531915, "grad_norm": 0.5301738065791539, "learning_rate": 1.6543469682057105e-06, "loss": 0.6413, "step": 178 }, { "epoch": 2.2851063829787233, "grad_norm": 0.5082369172245946, "learning_rate": 1.5991363111454023e-06, "loss": 0.6976, "step": 179 }, { "epoch": 2.297872340425532, "grad_norm": 0.4793427755174214, "learning_rate": 1.544686755065677e-06, "loss": 0.6723, "step": 180 }, { "epoch": 2.3106382978723405, "grad_norm": 0.4752348288080126, "learning_rate": 1.4910104855800429e-06, "loss": 0.7238, "step": 181 }, { "epoch": 2.323404255319149, "grad_norm": 0.5266752732027756, "learning_rate": 1.438119515243277e-06, "loss": 0.7487, "step": 182 }, { "epoch": 2.3361702127659574, "grad_norm": 0.4469475413548446, "learning_rate": 1.3860256808630429e-06, "loss": 0.6416, "step": 183 }, { "epoch": 2.348936170212766, "grad_norm": 0.4644626840971213, "learning_rate": 1.3347406408508695e-06, "loss": 0.7321, "step": 184 }, { "epoch": 2.3617021276595747, "grad_norm": 0.508650359767618, "learning_rate": 1.2842758726130283e-06, "loss": 0.7622, "step": 185 }, { "epoch": 2.374468085106383, "grad_norm": 0.5007379233254791, "learning_rate": 1.234642669981946e-06, "loss": 0.6713, "step": 186 }, { "epoch": 2.3872340425531915, "grad_norm": 0.49661728721534804, "learning_rate": 1.1858521406886674e-06, "loss": 0.7533, "step": 187 }, { "epoch": 2.4, "grad_norm": 0.5053348466732649, "learning_rate": 1.137915203877003e-06, "loss": 0.6469, "step": 188 }, { "epoch": 2.4127659574468083, "grad_norm": 0.5050705783638247, "learning_rate": 1.0908425876598512e-06, "loss": 0.6918, "step": 189 }, { "epoch": 2.425531914893617, "grad_norm": 0.4690909098789046, "learning_rate": 1.044644826718295e-06, "loss": 0.6792, "step": 190 }, { "epoch": 2.4382978723404256, "grad_norm": 0.5066403427414923, "learning_rate": 9.993322599439692e-07, "loss": 0.7389, "step": 191 }, { "epoch": 2.451063829787234, "grad_norm": 0.5085674433952515, "learning_rate": 9.549150281252633e-07, "loss": 0.6471, "step": 192 }, { "epoch": 2.4638297872340424, "grad_norm": 0.5306625504474757, "learning_rate": 9.114030716778433e-07, "loss": 0.7413, "step": 193 }, { "epoch": 2.476595744680851, "grad_norm": 0.4383722207579253, "learning_rate": 8.688061284200266e-07, "loss": 0.6261, "step": 194 }, { "epoch": 2.4893617021276597, "grad_norm": 0.46044975004169747, "learning_rate": 8.271337313934869e-07, "loss": 0.6867, "step": 195 }, { "epoch": 2.502127659574468, "grad_norm": 0.5149550991848054, "learning_rate": 7.863952067298042e-07, "loss": 0.7692, "step": 196 }, { "epoch": 2.5148936170212766, "grad_norm": 0.47405804721356093, "learning_rate": 7.465996715633028e-07, "loss": 0.6737, "step": 197 }, { "epoch": 2.527659574468085, "grad_norm": 0.4697866538815745, "learning_rate": 7.077560319906696e-07, "loss": 0.7463, "step": 198 }, { "epoch": 2.5404255319148934, "grad_norm": 0.45370547180350884, "learning_rate": 6.698729810778065e-07, "loss": 0.7835, "step": 199 }, { "epoch": 2.5531914893617023, "grad_norm": 0.447877218083466, "learning_rate": 6.329589969143518e-07, "loss": 0.7075, "step": 200 }, { "epoch": 2.5659574468085107, "grad_norm": 0.4851283761699553, "learning_rate": 5.9702234071631e-07, "loss": 0.7195, "step": 201 }, { "epoch": 2.578723404255319, "grad_norm": 0.4562237608290687, "learning_rate": 5.620710549772295e-07, "loss": 0.6318, "step": 202 }, { "epoch": 2.5914893617021275, "grad_norm": 0.5050620555976538, "learning_rate": 5.281129616683167e-07, "loss": 0.7642, "step": 203 }, { "epoch": 2.604255319148936, "grad_norm": 0.49378376497238, "learning_rate": 4.951556604879049e-07, "loss": 0.7023, "step": 204 }, { "epoch": 2.617021276595745, "grad_norm": 0.5128163218131276, "learning_rate": 4.632065271606756e-07, "loss": 0.7142, "step": 205 }, { "epoch": 2.629787234042553, "grad_norm": 0.4988207702725477, "learning_rate": 4.322727117869951e-07, "loss": 0.6562, "step": 206 }, { "epoch": 2.6425531914893616, "grad_norm": 0.4681720156840467, "learning_rate": 4.0236113724274716e-07, "loss": 0.7008, "step": 207 }, { "epoch": 2.65531914893617, "grad_norm": 0.4892617801756113, "learning_rate": 3.734784976300165e-07, "loss": 0.7654, "step": 208 }, { "epoch": 2.6680851063829785, "grad_norm": 0.5131199581603079, "learning_rate": 3.4563125677897936e-07, "loss": 0.6106, "step": 209 }, { "epoch": 2.6808510638297873, "grad_norm": 0.5061855484479584, "learning_rate": 3.18825646801314e-07, "loss": 0.8295, "step": 210 }, { "epoch": 2.6936170212765957, "grad_norm": 0.4532598152764269, "learning_rate": 2.930676666954846e-07, "loss": 0.6481, "step": 211 }, { "epoch": 2.706382978723404, "grad_norm": 0.46463132032117166, "learning_rate": 2.6836308100417874e-07, "loss": 0.642, "step": 212 }, { "epoch": 2.719148936170213, "grad_norm": 0.49012449300116195, "learning_rate": 2.447174185242324e-07, "loss": 0.7463, "step": 213 }, { "epoch": 2.731914893617021, "grad_norm": 0.4636477810079059, "learning_rate": 2.2213597106929608e-07, "loss": 0.6347, "step": 214 }, { "epoch": 2.74468085106383, "grad_norm": 0.47971257641576737, "learning_rate": 2.006237922855553e-07, "loss": 0.8141, "step": 215 }, { "epoch": 2.7574468085106383, "grad_norm": 0.4522928987229415, "learning_rate": 1.801856965207338e-07, "loss": 0.6098, "step": 216 }, { "epoch": 2.7702127659574467, "grad_norm": 0.4708467076318233, "learning_rate": 1.6082625774666793e-07, "loss": 0.7974, "step": 217 }, { "epoch": 2.7829787234042556, "grad_norm": 0.45260252635853443, "learning_rate": 1.4254980853566248e-07, "loss": 0.6051, "step": 218 }, { "epoch": 2.795744680851064, "grad_norm": 0.4772874712122134, "learning_rate": 1.253604390908819e-07, "loss": 0.7594, "step": 219 }, { "epoch": 2.8085106382978724, "grad_norm": 0.4379365885870637, "learning_rate": 1.0926199633097156e-07, "loss": 0.6206, "step": 220 }, { "epoch": 2.821276595744681, "grad_norm": 0.5353989006280159, "learning_rate": 9.42580830291373e-08, "loss": 0.8591, "step": 221 }, { "epoch": 2.8340425531914892, "grad_norm": 0.4388978526940481, "learning_rate": 8.035205700685167e-08, "loss": 0.6012, "step": 222 }, { "epoch": 2.846808510638298, "grad_norm": 0.4627380722362225, "learning_rate": 6.75470303823933e-08, "loss": 0.6869, "step": 223 }, { "epoch": 2.8595744680851065, "grad_norm": 0.5100171595604029, "learning_rate": 5.584586887435739e-08, "loss": 0.6985, "step": 224 }, { "epoch": 2.872340425531915, "grad_norm": 0.5163263210032598, "learning_rate": 4.52511911603265e-08, "loss": 0.7723, "step": 225 }, { "epoch": 2.8851063829787233, "grad_norm": 0.4703957778516729, "learning_rate": 3.576536829081323e-08, "loss": 0.6751, "step": 226 }, { "epoch": 2.8978723404255318, "grad_norm": 0.4353996932568037, "learning_rate": 2.7390523158633552e-08, "loss": 0.6796, "step": 227 }, { "epoch": 2.9106382978723406, "grad_norm": 0.5329335872850072, "learning_rate": 2.012853002380466e-08, "loss": 0.7351, "step": 228 }, { "epoch": 2.923404255319149, "grad_norm": 0.5187235806914524, "learning_rate": 1.3981014094099354e-08, "loss": 0.6504, "step": 229 }, { "epoch": 2.9361702127659575, "grad_norm": 0.49115304636235096, "learning_rate": 8.949351161324227e-09, "loss": 0.6921, "step": 230 }, { "epoch": 2.948936170212766, "grad_norm": 0.5169642162297178, "learning_rate": 5.034667293427053e-09, "loss": 0.7357, "step": 231 }, { "epoch": 2.9617021276595743, "grad_norm": 0.467272931984639, "learning_rate": 2.237838582483387e-09, "loss": 0.6568, "step": 232 }, { "epoch": 2.974468085106383, "grad_norm": 0.47352407416977127, "learning_rate": 5.594909486328348e-10, "loss": 0.6961, "step": 233 }, { "epoch": 2.9872340425531916, "grad_norm": 0.46921973983708287, "learning_rate": 0.0, "loss": 0.6174, "step": 234 }, { "epoch": 2.9872340425531916, "step": 234, "total_flos": 182640614014976.0, "train_loss": 0.7958560710788792, "train_runtime": 3531.6623, "train_samples_per_second": 6.371, "train_steps_per_second": 0.066 } ], "logging_steps": 1, "max_steps": 234, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 182640614014976.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }