{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9975990396158463, "eval_steps": 500, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006402561024409764, "grad_norm": 13.680483919438638, "learning_rate": 6.25e-08, "loss": 0.7419, "step": 1 }, { "epoch": 0.012805122048819529, "grad_norm": 12.895832423301675, "learning_rate": 1.25e-07, "loss": 0.6763, "step": 2 }, { "epoch": 0.01920768307322929, "grad_norm": 12.171002116468559, "learning_rate": 1.875e-07, "loss": 0.6351, "step": 3 }, { "epoch": 0.025610244097639057, "grad_norm": 12.464785861868467, "learning_rate": 2.5e-07, "loss": 0.7288, "step": 4 }, { "epoch": 0.03201280512204882, "grad_norm": 10.388017670547901, "learning_rate": 3.1249999999999997e-07, "loss": 0.6076, "step": 5 }, { "epoch": 0.03841536614645858, "grad_norm": 12.079460028084922, "learning_rate": 3.75e-07, "loss": 0.6902, "step": 6 }, { "epoch": 0.04481792717086835, "grad_norm": 12.19876157935689, "learning_rate": 4.375e-07, "loss": 0.6948, "step": 7 }, { "epoch": 0.051220488195278115, "grad_norm": 9.756977949266757, "learning_rate": 5e-07, "loss": 0.7175, "step": 8 }, { "epoch": 0.057623049219687875, "grad_norm": 10.46968505857669, "learning_rate": 5.625e-07, "loss": 0.6654, "step": 9 }, { "epoch": 0.06402561024409764, "grad_norm": 8.571146829037712, "learning_rate": 6.249999999999999e-07, "loss": 0.6365, "step": 10 }, { "epoch": 0.07042817126850741, "grad_norm": 7.915474952782504, "learning_rate": 6.875e-07, "loss": 0.6584, "step": 11 }, { "epoch": 0.07683073229291716, "grad_norm": 8.017393974111991, "learning_rate": 7.5e-07, "loss": 0.6789, "step": 12 }, { "epoch": 0.08323329331732693, "grad_norm": 7.146421473324447, "learning_rate": 8.125e-07, "loss": 0.6551, "step": 13 }, { "epoch": 0.0896358543417367, "grad_norm": 6.490398797125461, "learning_rate": 8.75e-07, "loss": 0.5721, "step": 14 }, { "epoch": 0.09603841536614646, "grad_norm": 6.813176303739041, "learning_rate": 9.374999999999999e-07, "loss": 0.5933, "step": 15 }, { "epoch": 0.10244097639055623, "grad_norm": 6.452956439451391, "learning_rate": 1e-06, "loss": 0.5332, "step": 16 }, { "epoch": 0.10884353741496598, "grad_norm": 6.639180709933743, "learning_rate": 1.0625e-06, "loss": 0.5836, "step": 17 }, { "epoch": 0.11524609843937575, "grad_norm": 5.461791194778153, "learning_rate": 1.125e-06, "loss": 0.5577, "step": 18 }, { "epoch": 0.12164865946378552, "grad_norm": 5.930832649263125, "learning_rate": 1.1874999999999999e-06, "loss": 0.5337, "step": 19 }, { "epoch": 0.12805122048819528, "grad_norm": 4.956904837601893, "learning_rate": 1.2499999999999999e-06, "loss": 0.4669, "step": 20 }, { "epoch": 0.13445378151260504, "grad_norm": 6.256208371824237, "learning_rate": 1.3125e-06, "loss": 0.4811, "step": 21 }, { "epoch": 0.14085634253701482, "grad_norm": 5.246513837804897, "learning_rate": 1.375e-06, "loss": 0.4414, "step": 22 }, { "epoch": 0.14725890356142457, "grad_norm": 5.528561343115844, "learning_rate": 1.4375e-06, "loss": 0.4948, "step": 23 }, { "epoch": 0.15366146458583432, "grad_norm": 5.333224339149711, "learning_rate": 1.5e-06, "loss": 0.4281, "step": 24 }, { "epoch": 0.1600640256102441, "grad_norm": 4.9864846595798324, "learning_rate": 1.5624999999999999e-06, "loss": 0.4362, "step": 25 }, { "epoch": 0.16646658663465386, "grad_norm": 5.269516171255013, "learning_rate": 1.625e-06, "loss": 0.4036, "step": 26 }, { "epoch": 0.17286914765906364, "grad_norm": 4.932131322685013, "learning_rate": 1.6875e-06, "loss": 0.4038, "step": 27 }, { "epoch": 0.1792717086834734, "grad_norm": 5.318953799775683, "learning_rate": 1.75e-06, "loss": 0.4283, "step": 28 }, { "epoch": 0.18567426970788314, "grad_norm": 5.190854642160747, "learning_rate": 1.8125e-06, "loss": 0.4044, "step": 29 }, { "epoch": 0.19207683073229292, "grad_norm": 4.807806644055786, "learning_rate": 1.8749999999999998e-06, "loss": 0.4117, "step": 30 }, { "epoch": 0.19847939175670268, "grad_norm": 4.292846106324363, "learning_rate": 1.9375e-06, "loss": 0.3416, "step": 31 }, { "epoch": 0.20488195278111246, "grad_norm": 4.03903415559866, "learning_rate": 2e-06, "loss": 0.3794, "step": 32 }, { "epoch": 0.2112845138055222, "grad_norm": 4.493745802649108, "learning_rate": 1.9999370567547003e-06, "loss": 0.3952, "step": 33 }, { "epoch": 0.21768707482993196, "grad_norm": 4.645817807816507, "learning_rate": 1.9997482349425066e-06, "loss": 0.3743, "step": 34 }, { "epoch": 0.22408963585434175, "grad_norm": 5.0435551125438, "learning_rate": 1.9994335583335335e-06, "loss": 0.3525, "step": 35 }, { "epoch": 0.2304921968787515, "grad_norm": 4.788576656798642, "learning_rate": 1.9989930665413145e-06, "loss": 0.3865, "step": 36 }, { "epoch": 0.23689475790316125, "grad_norm": 4.390755031941915, "learning_rate": 1.9984268150178167e-06, "loss": 0.3295, "step": 37 }, { "epoch": 0.24329731892757103, "grad_norm": 4.772214941084568, "learning_rate": 1.997734875046456e-06, "loss": 0.3334, "step": 38 }, { "epoch": 0.24969987995198079, "grad_norm": 5.273946498985277, "learning_rate": 1.996917333733128e-06, "loss": 0.3726, "step": 39 }, { "epoch": 0.25610244097639057, "grad_norm": 5.369010852231794, "learning_rate": 1.995974293995239e-06, "loss": 0.3707, "step": 40 }, { "epoch": 0.26250500200080035, "grad_norm": 5.767047861871442, "learning_rate": 1.994905874548752e-06, "loss": 0.3604, "step": 41 }, { "epoch": 0.2689075630252101, "grad_norm": 5.503870643763186, "learning_rate": 1.9937122098932426e-06, "loss": 0.3746, "step": 42 }, { "epoch": 0.27531012404961985, "grad_norm": 6.3642035735044935, "learning_rate": 1.9923934502949643e-06, "loss": 0.3524, "step": 43 }, { "epoch": 0.28171268507402963, "grad_norm": 5.870101299422478, "learning_rate": 1.9909497617679347e-06, "loss": 0.3672, "step": 44 }, { "epoch": 0.28811524609843936, "grad_norm": 4.907548990312214, "learning_rate": 1.9893813260530367e-06, "loss": 0.3748, "step": 45 }, { "epoch": 0.29451780712284914, "grad_norm": 4.106972565193302, "learning_rate": 1.9876883405951377e-06, "loss": 0.3265, "step": 46 }, { "epoch": 0.3009203681472589, "grad_norm": 4.649037094549095, "learning_rate": 1.9858710185182355e-06, "loss": 0.2802, "step": 47 }, { "epoch": 0.30732292917166865, "grad_norm": 4.578190605072973, "learning_rate": 1.9839295885986295e-06, "loss": 0.3414, "step": 48 }, { "epoch": 0.3137254901960784, "grad_norm": 5.338755744928473, "learning_rate": 1.9818642952361183e-06, "loss": 0.3513, "step": 49 }, { "epoch": 0.3201280512204882, "grad_norm": 5.16732801301707, "learning_rate": 1.9796753984232355e-06, "loss": 0.3385, "step": 50 }, { "epoch": 0.32653061224489793, "grad_norm": 4.308178011035292, "learning_rate": 1.977363173712519e-06, "loss": 0.3337, "step": 51 }, { "epoch": 0.3329331732693077, "grad_norm": 4.047304578891064, "learning_rate": 1.9749279121818236e-06, "loss": 0.3536, "step": 52 }, { "epoch": 0.3393357342937175, "grad_norm": 4.189108068341681, "learning_rate": 1.9723699203976766e-06, "loss": 0.3007, "step": 53 }, { "epoch": 0.3457382953181273, "grad_norm": 4.150519885409928, "learning_rate": 1.9696895203766866e-06, "loss": 0.2981, "step": 54 }, { "epoch": 0.352140856342537, "grad_norm": 4.59545667540724, "learning_rate": 1.966887049545006e-06, "loss": 0.3121, "step": 55 }, { "epoch": 0.3585434173669468, "grad_norm": 3.9736439160316683, "learning_rate": 1.9639628606958534e-06, "loss": 0.3049, "step": 56 }, { "epoch": 0.36494597839135656, "grad_norm": 4.3054703674323, "learning_rate": 1.9609173219450997e-06, "loss": 0.3124, "step": 57 }, { "epoch": 0.3713485394157663, "grad_norm": 3.4948815561353537, "learning_rate": 1.9577508166849303e-06, "loss": 0.3013, "step": 58 }, { "epoch": 0.37775110044017607, "grad_norm": 3.741965125562426, "learning_rate": 1.9544637435355806e-06, "loss": 0.2894, "step": 59 }, { "epoch": 0.38415366146458585, "grad_norm": 4.280653888850941, "learning_rate": 1.9510565162951534e-06, "loss": 0.3134, "step": 60 }, { "epoch": 0.3905562224889956, "grad_norm": 4.405700671682634, "learning_rate": 1.947529563887529e-06, "loss": 0.3177, "step": 61 }, { "epoch": 0.39695878351340536, "grad_norm": 3.9394485247143636, "learning_rate": 1.9438833303083674e-06, "loss": 0.2966, "step": 62 }, { "epoch": 0.40336134453781514, "grad_norm": 4.389702795617689, "learning_rate": 1.9401182745692187e-06, "loss": 0.2903, "step": 63 }, { "epoch": 0.4097639055622249, "grad_norm": 3.5979658724911077, "learning_rate": 1.936234870639737e-06, "loss": 0.2953, "step": 64 }, { "epoch": 0.41616646658663464, "grad_norm": 4.47441718112865, "learning_rate": 1.9322336073880143e-06, "loss": 0.3248, "step": 65 }, { "epoch": 0.4225690276110444, "grad_norm": 3.9355652185440637, "learning_rate": 1.928114988519039e-06, "loss": 0.3093, "step": 66 }, { "epoch": 0.4289715886354542, "grad_norm": 4.580138491679913, "learning_rate": 1.9238795325112867e-06, "loss": 0.3471, "step": 67 }, { "epoch": 0.43537414965986393, "grad_norm": 4.264136088165448, "learning_rate": 1.9195277725514506e-06, "loss": 0.3202, "step": 68 }, { "epoch": 0.4417767106842737, "grad_norm": 3.955882298206603, "learning_rate": 1.91506025646732e-06, "loss": 0.3297, "step": 69 }, { "epoch": 0.4481792717086835, "grad_norm": 3.5995979170148513, "learning_rate": 1.9104775466588157e-06, "loss": 0.2737, "step": 70 }, { "epoch": 0.4545818327330932, "grad_norm": 5.25272090784733, "learning_rate": 1.905780220027194e-06, "loss": 0.2706, "step": 71 }, { "epoch": 0.460984393757503, "grad_norm": 4.341561414420844, "learning_rate": 1.9009688679024189e-06, "loss": 0.3079, "step": 72 }, { "epoch": 0.4673869547819128, "grad_norm": 3.6758492701618226, "learning_rate": 1.8960440959687252e-06, "loss": 0.2924, "step": 73 }, { "epoch": 0.4737895158063225, "grad_norm": 3.9451817967595386, "learning_rate": 1.8910065241883678e-06, "loss": 0.2971, "step": 74 }, { "epoch": 0.4801920768307323, "grad_norm": 4.96315413342626, "learning_rate": 1.8858567867235798e-06, "loss": 0.2673, "step": 75 }, { "epoch": 0.48659463785514206, "grad_norm": 4.139530587876671, "learning_rate": 1.8805955318567379e-06, "loss": 0.27, "step": 76 }, { "epoch": 0.49299719887955185, "grad_norm": 4.036729827911165, "learning_rate": 1.8752234219087537e-06, "loss": 0.3079, "step": 77 }, { "epoch": 0.49939975990396157, "grad_norm": 4.267304261790974, "learning_rate": 1.8697411331556953e-06, "loss": 0.2777, "step": 78 }, { "epoch": 0.5058023209283713, "grad_norm": 4.140171639449446, "learning_rate": 1.8641493557436548e-06, "loss": 0.2965, "step": 79 }, { "epoch": 0.5122048819527811, "grad_norm": 4.42365589599126, "learning_rate": 1.858448793601866e-06, "loss": 0.3452, "step": 80 }, { "epoch": 0.5186074429771909, "grad_norm": 4.2607097203243125, "learning_rate": 1.852640164354092e-06, "loss": 0.2703, "step": 81 }, { "epoch": 0.5250100040016007, "grad_norm": 3.9797822473229814, "learning_rate": 1.8467241992282841e-06, "loss": 0.2994, "step": 82 }, { "epoch": 0.5314125650260104, "grad_norm": 3.7553052637845035, "learning_rate": 1.8407016429645302e-06, "loss": 0.296, "step": 83 }, { "epoch": 0.5378151260504201, "grad_norm": 3.7344036316964893, "learning_rate": 1.8345732537213026e-06, "loss": 0.3052, "step": 84 }, { "epoch": 0.54421768707483, "grad_norm": 4.104658693180337, "learning_rate": 1.8283398029800164e-06, "loss": 0.2794, "step": 85 }, { "epoch": 0.5506202480992397, "grad_norm": 3.86258232609451, "learning_rate": 1.82200207544791e-06, "loss": 0.2729, "step": 86 }, { "epoch": 0.5570228091236494, "grad_norm": 3.5583118098950064, "learning_rate": 1.8155608689592601e-06, "loss": 0.2882, "step": 87 }, { "epoch": 0.5634253701480593, "grad_norm": 4.521351652684801, "learning_rate": 1.8090169943749474e-06, "loss": 0.3041, "step": 88 }, { "epoch": 0.569827931172469, "grad_norm": 3.403629696215942, "learning_rate": 1.802371275480378e-06, "loss": 0.2579, "step": 89 }, { "epoch": 0.5762304921968787, "grad_norm": 3.6982154812345396, "learning_rate": 1.795624548881781e-06, "loss": 0.2464, "step": 90 }, { "epoch": 0.5826330532212886, "grad_norm": 4.352958033159996, "learning_rate": 1.7887776639008912e-06, "loss": 0.2641, "step": 91 }, { "epoch": 0.5890356142456983, "grad_norm": 4.54700815993712, "learning_rate": 1.7818314824680298e-06, "loss": 0.3019, "step": 92 }, { "epoch": 0.595438175270108, "grad_norm": 3.789128443890756, "learning_rate": 1.774786879013601e-06, "loss": 0.2883, "step": 93 }, { "epoch": 0.6018407362945178, "grad_norm": 4.340181858310212, "learning_rate": 1.767644740358011e-06, "loss": 0.3271, "step": 94 }, { "epoch": 0.6082432973189276, "grad_norm": 4.181233142586761, "learning_rate": 1.760405965600031e-06, "loss": 0.3036, "step": 95 }, { "epoch": 0.6146458583433373, "grad_norm": 4.468139724866329, "learning_rate": 1.753071466003611e-06, "loss": 0.2808, "step": 96 }, { "epoch": 0.6210484193677471, "grad_norm": 4.510323988503172, "learning_rate": 1.7456421648831654e-06, "loss": 0.26, "step": 97 }, { "epoch": 0.6274509803921569, "grad_norm": 3.8787104679210507, "learning_rate": 1.7381189974873407e-06, "loss": 0.2888, "step": 98 }, { "epoch": 0.6338535414165666, "grad_norm": 4.459830521867393, "learning_rate": 1.7305029108812774e-06, "loss": 0.2464, "step": 99 }, { "epoch": 0.6402561024409764, "grad_norm": 4.709563601825434, "learning_rate": 1.7227948638273915e-06, "loss": 0.3026, "step": 100 }, { "epoch": 0.6466586634653861, "grad_norm": 3.7651337955677255, "learning_rate": 1.7149958266646754e-06, "loss": 0.2786, "step": 101 }, { "epoch": 0.6530612244897959, "grad_norm": 5.5973077659411405, "learning_rate": 1.7071067811865474e-06, "loss": 0.288, "step": 102 }, { "epoch": 0.6594637855142057, "grad_norm": 3.8873700424843127, "learning_rate": 1.6991287205172574e-06, "loss": 0.2722, "step": 103 }, { "epoch": 0.6658663465386154, "grad_norm": 4.2130449731998425, "learning_rate": 1.6910626489868648e-06, "loss": 0.2955, "step": 104 }, { "epoch": 0.6722689075630253, "grad_norm": 4.14817042922031, "learning_rate": 1.682909582004807e-06, "loss": 0.3185, "step": 105 }, { "epoch": 0.678671468587435, "grad_norm": 4.776345876515052, "learning_rate": 1.6746705459320744e-06, "loss": 0.3116, "step": 106 }, { "epoch": 0.6850740296118447, "grad_norm": 3.857374024007739, "learning_rate": 1.6663465779520037e-06, "loss": 0.3189, "step": 107 }, { "epoch": 0.6914765906362546, "grad_norm": 4.133483454926766, "learning_rate": 1.6579387259397126e-06, "loss": 0.3082, "step": 108 }, { "epoch": 0.6978791516606643, "grad_norm": 4.020769377918411, "learning_rate": 1.6494480483301835e-06, "loss": 0.2825, "step": 109 }, { "epoch": 0.704281712685074, "grad_norm": 4.261022360359079, "learning_rate": 1.640875613985024e-06, "loss": 0.2907, "step": 110 }, { "epoch": 0.7106842737094838, "grad_norm": 3.712351214158889, "learning_rate": 1.6322225020579096e-06, "loss": 0.2822, "step": 111 }, { "epoch": 0.7170868347338936, "grad_norm": 3.440740014931854, "learning_rate": 1.6234898018587336e-06, "loss": 0.2381, "step": 112 }, { "epoch": 0.7234893957583033, "grad_norm": 3.775626420708748, "learning_rate": 1.6146786127164771e-06, "loss": 0.2937, "step": 113 }, { "epoch": 0.7298919567827131, "grad_norm": 3.8211915298740666, "learning_rate": 1.6057900438408199e-06, "loss": 0.2843, "step": 114 }, { "epoch": 0.7362945178071229, "grad_norm": 3.752355539307679, "learning_rate": 1.5968252141825035e-06, "loss": 0.2648, "step": 115 }, { "epoch": 0.7426970788315326, "grad_norm": 3.842629249054385, "learning_rate": 1.587785252292473e-06, "loss": 0.27, "step": 116 }, { "epoch": 0.7490996398559424, "grad_norm": 4.166939902099283, "learning_rate": 1.578671296179806e-06, "loss": 0.2521, "step": 117 }, { "epoch": 0.7555022008803521, "grad_norm": 3.8837453592291844, "learning_rate": 1.569484493168452e-06, "loss": 0.2926, "step": 118 }, { "epoch": 0.7619047619047619, "grad_norm": 3.9849389096374837, "learning_rate": 1.5602259997528027e-06, "loss": 0.2415, "step": 119 }, { "epoch": 0.7683073229291717, "grad_norm": 3.8214588045054407, "learning_rate": 1.5508969814521024e-06, "loss": 0.2896, "step": 120 }, { "epoch": 0.7747098839535814, "grad_norm": 3.596004790144505, "learning_rate": 1.5414986126637257e-06, "loss": 0.25, "step": 121 }, { "epoch": 0.7811124449779911, "grad_norm": 3.7868659894236734, "learning_rate": 1.5320320765153365e-06, "loss": 0.2742, "step": 122 }, { "epoch": 0.787515006002401, "grad_norm": 3.605103510806505, "learning_rate": 1.5224985647159488e-06, "loss": 0.2288, "step": 123 }, { "epoch": 0.7939175670268107, "grad_norm": 4.164526361851514, "learning_rate": 1.5128992774059062e-06, "loss": 0.257, "step": 124 }, { "epoch": 0.8003201280512204, "grad_norm": 4.2601493772240095, "learning_rate": 1.5032354230058002e-06, "loss": 0.2703, "step": 125 }, { "epoch": 0.8067226890756303, "grad_norm": 4.230695549396448, "learning_rate": 1.4935082180643467e-06, "loss": 0.315, "step": 126 }, { "epoch": 0.81312525010004, "grad_norm": 4.029734026287984, "learning_rate": 1.4837188871052397e-06, "loss": 0.2587, "step": 127 }, { "epoch": 0.8195278111244498, "grad_norm": 4.418674847039931, "learning_rate": 1.4738686624729987e-06, "loss": 0.3162, "step": 128 }, { "epoch": 0.8259303721488596, "grad_norm": 4.0453350041861205, "learning_rate": 1.463958784177834e-06, "loss": 0.2534, "step": 129 }, { "epoch": 0.8323329331732693, "grad_norm": 4.6877576825765335, "learning_rate": 1.4539904997395467e-06, "loss": 0.2631, "step": 130 }, { "epoch": 0.8387354941976791, "grad_norm": 3.780029409396397, "learning_rate": 1.4439650640304821e-06, "loss": 0.271, "step": 131 }, { "epoch": 0.8451380552220888, "grad_norm": 3.6785546417231756, "learning_rate": 1.433883739117558e-06, "loss": 0.2418, "step": 132 }, { "epoch": 0.8515406162464986, "grad_norm": 3.396354715623399, "learning_rate": 1.4237477941033886e-06, "loss": 0.2499, "step": 133 }, { "epoch": 0.8579431772709084, "grad_norm": 3.7280690326153265, "learning_rate": 1.4135585049665206e-06, "loss": 0.2846, "step": 134 }, { "epoch": 0.8643457382953181, "grad_norm": 4.004231717394044, "learning_rate": 1.4033171544008051e-06, "loss": 0.2938, "step": 135 }, { "epoch": 0.8707482993197279, "grad_norm": 3.342930768359313, "learning_rate": 1.3930250316539235e-06, "loss": 0.2693, "step": 136 }, { "epoch": 0.8771508603441377, "grad_norm": 3.9327560661895244, "learning_rate": 1.3826834323650898e-06, "loss": 0.3064, "step": 137 }, { "epoch": 0.8835534213685474, "grad_norm": 3.8159537564044954, "learning_rate": 1.3722936584019451e-06, "loss": 0.2529, "step": 138 }, { "epoch": 0.8899559823929571, "grad_norm": 3.453933647883322, "learning_rate": 1.3618570176966722e-06, "loss": 0.2837, "step": 139 }, { "epoch": 0.896358543417367, "grad_norm": 3.7776942360972035, "learning_rate": 1.3513748240813427e-06, "loss": 0.3414, "step": 140 }, { "epoch": 0.9027611044417767, "grad_norm": 3.701115924801193, "learning_rate": 1.3408483971225249e-06, "loss": 0.2845, "step": 141 }, { "epoch": 0.9091636654661864, "grad_norm": 3.642984026753104, "learning_rate": 1.3302790619551672e-06, "loss": 0.2472, "step": 142 }, { "epoch": 0.9155662264905963, "grad_norm": 3.875796995615745, "learning_rate": 1.3196681491157816e-06, "loss": 0.2481, "step": 143 }, { "epoch": 0.921968787515006, "grad_norm": 3.8424117933387256, "learning_rate": 1.3090169943749473e-06, "loss": 0.2819, "step": 144 }, { "epoch": 0.9283713485394157, "grad_norm": 4.037093475071882, "learning_rate": 1.298326938569156e-06, "loss": 0.324, "step": 145 }, { "epoch": 0.9347739095638256, "grad_norm": 3.5202146156430447, "learning_rate": 1.2875993274320173e-06, "loss": 0.2459, "step": 146 }, { "epoch": 0.9411764705882353, "grad_norm": 3.740795759086208, "learning_rate": 1.2768355114248492e-06, "loss": 0.2443, "step": 147 }, { "epoch": 0.947579031612645, "grad_norm": 4.3061782207302866, "learning_rate": 1.266036845566675e-06, "loss": 0.3083, "step": 148 }, { "epoch": 0.9539815926370548, "grad_norm": 3.9442811474090775, "learning_rate": 1.2552046892636426e-06, "loss": 0.244, "step": 149 }, { "epoch": 0.9603841536614646, "grad_norm": 3.9865071324515027, "learning_rate": 1.244340406137894e-06, "loss": 0.2277, "step": 150 }, { "epoch": 0.9667867146858744, "grad_norm": 4.016047319687184, "learning_rate": 1.2334453638559054e-06, "loss": 0.2541, "step": 151 }, { "epoch": 0.9731892757102841, "grad_norm": 4.032894518578795, "learning_rate": 1.2225209339563143e-06, "loss": 0.2575, "step": 152 }, { "epoch": 0.9795918367346939, "grad_norm": 4.065384007721365, "learning_rate": 1.211568491677263e-06, "loss": 0.2638, "step": 153 }, { "epoch": 0.9859943977591037, "grad_norm": 4.082382918746455, "learning_rate": 1.2005894157832728e-06, "loss": 0.3124, "step": 154 }, { "epoch": 0.9923969587835134, "grad_norm": 4.690622898413357, "learning_rate": 1.1895850883916785e-06, "loss": 0.2434, "step": 155 }, { "epoch": 0.9987995198079231, "grad_norm": 4.05650193960916, "learning_rate": 1.1785568947986366e-06, "loss": 0.3047, "step": 156 }, { "epoch": 1.005202080832333, "grad_norm": 3.2193287198369993, "learning_rate": 1.1675062233047363e-06, "loss": 0.22, "step": 157 }, { "epoch": 1.0116046418567426, "grad_norm": 2.8522266075564024, "learning_rate": 1.156434465040231e-06, "loss": 0.2104, "step": 158 }, { "epoch": 1.0180072028811524, "grad_norm": 2.960293382016092, "learning_rate": 1.1453430137899128e-06, "loss": 0.224, "step": 159 }, { "epoch": 1.0244097639055623, "grad_norm": 3.1247052252152434, "learning_rate": 1.1342332658176555e-06, "loss": 0.2318, "step": 160 }, { "epoch": 1.0308123249299719, "grad_norm": 2.7511164775931163, "learning_rate": 1.123106619690643e-06, "loss": 0.2045, "step": 161 }, { "epoch": 1.0372148859543817, "grad_norm": 2.7289267777021027, "learning_rate": 1.1119644761033077e-06, "loss": 0.2038, "step": 162 }, { "epoch": 1.0436174469787916, "grad_norm": 2.713995483150959, "learning_rate": 1.1008082377010045e-06, "loss": 0.2129, "step": 163 }, { "epoch": 1.0500200080032012, "grad_norm": 2.78973076664929, "learning_rate": 1.0896393089034335e-06, "loss": 0.2182, "step": 164 }, { "epoch": 1.056422569027611, "grad_norm": 2.9643061675150038, "learning_rate": 1.078459095727845e-06, "loss": 0.2232, "step": 165 }, { "epoch": 1.0628251300520208, "grad_norm": 2.8380963314510113, "learning_rate": 1.0672690056120398e-06, "loss": 0.2227, "step": 166 }, { "epoch": 1.0692276910764307, "grad_norm": 2.687954696836688, "learning_rate": 1.0560704472371917e-06, "loss": 0.2114, "step": 167 }, { "epoch": 1.0756302521008403, "grad_norm": 2.9271042366476996, "learning_rate": 1.044864830350515e-06, "loss": 0.2059, "step": 168 }, { "epoch": 1.0820328131252501, "grad_norm": 3.0180594470719897, "learning_rate": 1.033653565587794e-06, "loss": 0.2217, "step": 169 }, { "epoch": 1.08843537414966, "grad_norm": 3.1673109290531154, "learning_rate": 1.022438064295805e-06, "loss": 0.2155, "step": 170 }, { "epoch": 1.0948379351740696, "grad_norm": 3.0766952480651164, "learning_rate": 1.0112197383546459e-06, "loss": 0.217, "step": 171 }, { "epoch": 1.1012404961984794, "grad_norm": 2.719327293458082, "learning_rate": 1e-06, "loss": 0.1822, "step": 172 }, { "epoch": 1.1076430572228892, "grad_norm": 2.770227862802081, "learning_rate": 9.88780261645354e-07, "loss": 0.1926, "step": 173 }, { "epoch": 1.1140456182472989, "grad_norm": 3.0009223093685704, "learning_rate": 9.77561935704195e-07, "loss": 0.1911, "step": 174 }, { "epoch": 1.1204481792717087, "grad_norm": 3.023185799043678, "learning_rate": 9.663464344122063e-07, "loss": 0.1903, "step": 175 }, { "epoch": 1.1268507402961185, "grad_norm": 3.457998950657755, "learning_rate": 9.551351696494853e-07, "loss": 0.251, "step": 176 }, { "epoch": 1.1332533013205282, "grad_norm": 2.9312099897929036, "learning_rate": 9.43929552762808e-07, "loss": 0.2176, "step": 177 }, { "epoch": 1.139655862344938, "grad_norm": 3.3046823347401175, "learning_rate": 9.327309943879603e-07, "loss": 0.2109, "step": 178 }, { "epoch": 1.1460584233693478, "grad_norm": 3.0921904822263078, "learning_rate": 9.215409042721551e-07, "loss": 0.22, "step": 179 }, { "epoch": 1.1524609843937574, "grad_norm": 2.991127231725485, "learning_rate": 9.103606910965665e-07, "loss": 0.188, "step": 180 }, { "epoch": 1.1588635454181673, "grad_norm": 3.281303486352065, "learning_rate": 8.991917622989955e-07, "loss": 0.2051, "step": 181 }, { "epoch": 1.165266106442577, "grad_norm": 3.2779897667339393, "learning_rate": 8.880355238966921e-07, "loss": 0.2128, "step": 182 }, { "epoch": 1.1716686674669867, "grad_norm": 2.908646079818215, "learning_rate": 8.768933803093572e-07, "loss": 0.211, "step": 183 }, { "epoch": 1.1780712284913966, "grad_norm": 3.511327611579083, "learning_rate": 8.657667341823448e-07, "loss": 0.2067, "step": 184 }, { "epoch": 1.1844737895158064, "grad_norm": 3.446334131319975, "learning_rate": 8.546569862100875e-07, "loss": 0.2126, "step": 185 }, { "epoch": 1.190876350540216, "grad_norm": 3.086417796365709, "learning_rate": 8.435655349597689e-07, "loss": 0.1961, "step": 186 }, { "epoch": 1.1972789115646258, "grad_norm": 3.486051214472594, "learning_rate": 8.324937766952636e-07, "loss": 0.1954, "step": 187 }, { "epoch": 1.2036814725890357, "grad_norm": 3.4725632158887625, "learning_rate": 8.214431052013634e-07, "loss": 0.2207, "step": 188 }, { "epoch": 1.2100840336134453, "grad_norm": 3.102428873715714, "learning_rate": 8.104149116083216e-07, "loss": 0.2167, "step": 189 }, { "epoch": 1.2164865946378551, "grad_norm": 3.2423731064102457, "learning_rate": 7.994105842167272e-07, "loss": 0.1924, "step": 190 }, { "epoch": 1.222889155662265, "grad_norm": 3.2545985252021152, "learning_rate": 7.884315083227372e-07, "loss": 0.2029, "step": 191 }, { "epoch": 1.2292917166866746, "grad_norm": 3.0971604028938517, "learning_rate": 7.774790660436857e-07, "loss": 0.1741, "step": 192 }, { "epoch": 1.2356942777110844, "grad_norm": 3.443986175823104, "learning_rate": 7.665546361440949e-07, "loss": 0.2451, "step": 193 }, { "epoch": 1.2420968387354943, "grad_norm": 3.3511781482482847, "learning_rate": 7.556595938621058e-07, "loss": 0.1843, "step": 194 }, { "epoch": 1.2484993997599039, "grad_norm": 3.4059727380620686, "learning_rate": 7.447953107363574e-07, "loss": 0.2354, "step": 195 }, { "epoch": 1.2549019607843137, "grad_norm": 3.2416330111232243, "learning_rate": 7.33963154433325e-07, "loss": 0.1955, "step": 196 }, { "epoch": 1.2613045218087235, "grad_norm": 4.2347778935869345, "learning_rate": 7.231644885751507e-07, "loss": 0.217, "step": 197 }, { "epoch": 1.2677070828331334, "grad_norm": 3.567306963297163, "learning_rate": 7.124006725679828e-07, "loss": 0.2212, "step": 198 }, { "epoch": 1.274109643857543, "grad_norm": 3.00005386027494, "learning_rate": 7.016730614308439e-07, "loss": 0.1926, "step": 199 }, { "epoch": 1.2805122048819528, "grad_norm": 3.414969994137627, "learning_rate": 6.909830056250526e-07, "loss": 0.2044, "step": 200 }, { "epoch": 1.2869147659063627, "grad_norm": 3.5101423542049868, "learning_rate": 6.803318508842186e-07, "loss": 0.1958, "step": 201 }, { "epoch": 1.2933173269307723, "grad_norm": 3.08086263335831, "learning_rate": 6.697209380448332e-07, "loss": 0.2239, "step": 202 }, { "epoch": 1.2997198879551821, "grad_norm": 3.6991398316157835, "learning_rate": 6.59151602877475e-07, "loss": 0.2019, "step": 203 }, { "epoch": 1.306122448979592, "grad_norm": 3.1898949896135953, "learning_rate": 6.486251759186572e-07, "loss": 0.2276, "step": 204 }, { "epoch": 1.3125250100040016, "grad_norm": 3.15209044029482, "learning_rate": 6.381429823033279e-07, "loss": 0.1788, "step": 205 }, { "epoch": 1.3189275710284114, "grad_norm": 3.2074734897831756, "learning_rate": 6.277063415980548e-07, "loss": 0.2434, "step": 206 }, { "epoch": 1.3253301320528212, "grad_norm": 2.875848443017429, "learning_rate": 6.173165676349102e-07, "loss": 0.1809, "step": 207 }, { "epoch": 1.3317326930772309, "grad_norm": 3.273739860976, "learning_rate": 6.069749683460764e-07, "loss": 0.2, "step": 208 }, { "epoch": 1.3381352541016407, "grad_norm": 3.4156670470458494, "learning_rate": 5.96682845599195e-07, "loss": 0.2044, "step": 209 }, { "epoch": 1.3445378151260505, "grad_norm": 4.24358157090511, "learning_rate": 5.864414950334795e-07, "loss": 0.194, "step": 210 }, { "epoch": 1.3509403761504601, "grad_norm": 3.116102100075619, "learning_rate": 5.762522058966113e-07, "loss": 0.2089, "step": 211 }, { "epoch": 1.35734293717487, "grad_norm": 2.8625746825319034, "learning_rate": 5.661162608824419e-07, "loss": 0.2083, "step": 212 }, { "epoch": 1.3637454981992798, "grad_norm": 3.2057903093169555, "learning_rate": 5.56034935969518e-07, "loss": 0.1952, "step": 213 }, { "epoch": 1.3701480592236894, "grad_norm": 2.669239875130929, "learning_rate": 5.460095002604532e-07, "loss": 0.2056, "step": 214 }, { "epoch": 1.3765506202480993, "grad_norm": 3.2441222068201605, "learning_rate": 5.36041215822166e-07, "loss": 0.209, "step": 215 }, { "epoch": 1.382953181272509, "grad_norm": 3.0334649559489724, "learning_rate": 5.261313375270013e-07, "loss": 0.1821, "step": 216 }, { "epoch": 1.3893557422969187, "grad_norm": 3.005021545247239, "learning_rate": 5.162811128947602e-07, "loss": 0.1919, "step": 217 }, { "epoch": 1.3957583033213286, "grad_norm": 3.0185970628882184, "learning_rate": 5.064917819356531e-07, "loss": 0.2124, "step": 218 }, { "epoch": 1.4021608643457384, "grad_norm": 2.9841034532353645, "learning_rate": 4.967645769941999e-07, "loss": 0.1751, "step": 219 }, { "epoch": 1.408563425370148, "grad_norm": 3.1214597510428783, "learning_rate": 4.871007225940939e-07, "loss": 0.2245, "step": 220 }, { "epoch": 1.4149659863945578, "grad_norm": 3.118933296947675, "learning_rate": 4.775014352840512e-07, "loss": 0.2171, "step": 221 }, { "epoch": 1.4213685474189677, "grad_norm": 3.1742771966166567, "learning_rate": 4.6796792348466353e-07, "loss": 0.2132, "step": 222 }, { "epoch": 1.4277711084433773, "grad_norm": 2.602593575781098, "learning_rate": 4.585013873362743e-07, "loss": 0.1799, "step": 223 }, { "epoch": 1.4341736694677871, "grad_norm": 2.94290229867875, "learning_rate": 4.4910301854789755e-07, "loss": 0.1815, "step": 224 }, { "epoch": 1.440576230492197, "grad_norm": 3.2300005719414204, "learning_rate": 4.397740002471972e-07, "loss": 0.2143, "step": 225 }, { "epoch": 1.4469787915166066, "grad_norm": 2.8425860620393943, "learning_rate": 4.3051550683154804e-07, "loss": 0.2251, "step": 226 }, { "epoch": 1.4533813525410164, "grad_norm": 2.7744967195870136, "learning_rate": 4.2132870382019427e-07, "loss": 0.2023, "step": 227 }, { "epoch": 1.4597839135654262, "grad_norm": 2.8299937805822384, "learning_rate": 4.1221474770752696e-07, "loss": 0.1825, "step": 228 }, { "epoch": 1.4661864745898359, "grad_norm": 2.842358054849413, "learning_rate": 4.031747858174964e-07, "loss": 0.1815, "step": 229 }, { "epoch": 1.4725890356142457, "grad_norm": 3.0046695988860064, "learning_rate": 3.942099561591802e-07, "loss": 0.1876, "step": 230 }, { "epoch": 1.4789915966386555, "grad_norm": 3.049335410632264, "learning_rate": 3.853213872835228e-07, "loss": 0.1965, "step": 231 }, { "epoch": 1.4853941576630652, "grad_norm": 2.9908629914418663, "learning_rate": 3.765101981412665e-07, "loss": 0.206, "step": 232 }, { "epoch": 1.491796718687475, "grad_norm": 3.6142447406768, "learning_rate": 3.677774979420903e-07, "loss": 0.1962, "step": 233 }, { "epoch": 1.4981992797118848, "grad_norm": 3.1083828790336265, "learning_rate": 3.5912438601497584e-07, "loss": 0.1907, "step": 234 }, { "epoch": 1.5046018407362944, "grad_norm": 3.8490048708212017, "learning_rate": 3.5055195166981646e-07, "loss": 0.2483, "step": 235 }, { "epoch": 1.5110044017607043, "grad_norm": 3.5135746675092574, "learning_rate": 3.420612740602874e-07, "loss": 0.2197, "step": 236 }, { "epoch": 1.517406962785114, "grad_norm": 2.9605758005174083, "learning_rate": 3.3365342204799606e-07, "loss": 0.2048, "step": 237 }, { "epoch": 1.5238095238095237, "grad_norm": 3.102724567495504, "learning_rate": 3.253294540679257e-07, "loss": 0.2163, "step": 238 }, { "epoch": 1.5302120848339336, "grad_norm": 3.338826460184625, "learning_rate": 3.170904179951931e-07, "loss": 0.1892, "step": 239 }, { "epoch": 1.5366146458583434, "grad_norm": 3.234763671406949, "learning_rate": 3.0893735101313535e-07, "loss": 0.2762, "step": 240 }, { "epoch": 1.543017206882753, "grad_norm": 2.812979283705818, "learning_rate": 3.008712794827426e-07, "loss": 0.1717, "step": 241 }, { "epoch": 1.5494197679071628, "grad_norm": 2.934584165405633, "learning_rate": 2.9289321881345254e-07, "loss": 0.2, "step": 242 }, { "epoch": 1.5558223289315727, "grad_norm": 3.017927185569003, "learning_rate": 2.850041733353247e-07, "loss": 0.1887, "step": 243 }, { "epoch": 1.5622248899559823, "grad_norm": 2.821836163783163, "learning_rate": 2.7720513617260855e-07, "loss": 0.1685, "step": 244 }, { "epoch": 1.5686274509803921, "grad_norm": 2.9956132520307692, "learning_rate": 2.6949708911872247e-07, "loss": 0.1944, "step": 245 }, { "epoch": 1.575030012004802, "grad_norm": 3.012268102569264, "learning_rate": 2.6188100251265943e-07, "loss": 0.2168, "step": 246 }, { "epoch": 1.5814325730292116, "grad_norm": 2.6551981273651646, "learning_rate": 2.543578351168344e-07, "loss": 0.1587, "step": 247 }, { "epoch": 1.5878351340536214, "grad_norm": 2.6439156715272913, "learning_rate": 2.4692853399638913e-07, "loss": 0.1889, "step": 248 }, { "epoch": 1.5942376950780313, "grad_norm": 2.9350723750826266, "learning_rate": 2.395940343999691e-07, "loss": 0.2073, "step": 249 }, { "epoch": 1.6006402561024409, "grad_norm": 3.0827120888341013, "learning_rate": 2.3235525964198888e-07, "loss": 0.1895, "step": 250 }, { "epoch": 1.6070428171268507, "grad_norm": 3.1686080831269483, "learning_rate": 2.252131209863991e-07, "loss": 0.2029, "step": 251 }, { "epoch": 1.6134453781512605, "grad_norm": 2.851390750231077, "learning_rate": 2.181685175319702e-07, "loss": 0.2061, "step": 252 }, { "epoch": 1.6198479391756702, "grad_norm": 3.094325341444059, "learning_rate": 2.11222336099109e-07, "loss": 0.192, "step": 253 }, { "epoch": 1.62625050020008, "grad_norm": 3.1464174144330026, "learning_rate": 2.043754511182191e-07, "loss": 0.2103, "step": 254 }, { "epoch": 1.6326530612244898, "grad_norm": 3.120376340739763, "learning_rate": 1.9762872451962208e-07, "loss": 0.1705, "step": 255 }, { "epoch": 1.6390556222488994, "grad_norm": 3.2179511708389037, "learning_rate": 1.9098300562505264e-07, "loss": 0.1948, "step": 256 }, { "epoch": 1.6454581832733093, "grad_norm": 2.8898679416961692, "learning_rate": 1.8443913104073982e-07, "loss": 0.1823, "step": 257 }, { "epoch": 1.6518607442977191, "grad_norm": 2.993303933297039, "learning_rate": 1.7799792455209016e-07, "loss": 0.2126, "step": 258 }, { "epoch": 1.6582633053221287, "grad_norm": 3.2103383308153925, "learning_rate": 1.716601970199836e-07, "loss": 0.1961, "step": 259 }, { "epoch": 1.6646658663465386, "grad_norm": 3.235948737487849, "learning_rate": 1.6542674627869734e-07, "loss": 0.2057, "step": 260 }, { "epoch": 1.6710684273709484, "grad_norm": 3.103836779762489, "learning_rate": 1.592983570354699e-07, "loss": 0.2186, "step": 261 }, { "epoch": 1.677470988395358, "grad_norm": 3.014947796395499, "learning_rate": 1.5327580077171588e-07, "loss": 0.1912, "step": 262 }, { "epoch": 1.6838735494197679, "grad_norm": 3.2700341051751245, "learning_rate": 1.473598356459078e-07, "loss": 0.2456, "step": 263 }, { "epoch": 1.6902761104441777, "grad_norm": 3.2091848048221308, "learning_rate": 1.415512063981339e-07, "loss": 0.2099, "step": 264 }, { "epoch": 1.6966786714685873, "grad_norm": 2.750474336934699, "learning_rate": 1.358506442563454e-07, "loss": 0.2159, "step": 265 }, { "epoch": 1.7030812324929971, "grad_norm": 4.614673424699257, "learning_rate": 1.3025886684430465e-07, "loss": 0.2067, "step": 266 }, { "epoch": 1.709483793517407, "grad_norm": 2.9058686083863514, "learning_rate": 1.2477657809124632e-07, "loss": 0.1992, "step": 267 }, { "epoch": 1.7158863545418166, "grad_norm": 2.862146073630623, "learning_rate": 1.19404468143262e-07, "loss": 0.2142, "step": 268 }, { "epoch": 1.7222889155662267, "grad_norm": 3.287005616915961, "learning_rate": 1.1414321327642019e-07, "loss": 0.2051, "step": 269 }, { "epoch": 1.7286914765906363, "grad_norm": 2.9357679152099383, "learning_rate": 1.089934758116322e-07, "loss": 0.2113, "step": 270 }, { "epoch": 1.7350940376150459, "grad_norm": 2.9564341258962683, "learning_rate": 1.0395590403127486e-07, "loss": 0.1711, "step": 271 }, { "epoch": 1.741496598639456, "grad_norm": 3.1982558699085484, "learning_rate": 9.903113209758096e-08, "loss": 0.2086, "step": 272 }, { "epoch": 1.7478991596638656, "grad_norm": 2.9531309947941566, "learning_rate": 9.421977997280594e-08, "loss": 0.1959, "step": 273 }, { "epoch": 1.7543017206882752, "grad_norm": 3.070946141827486, "learning_rate": 8.952245334118413e-08, "loss": 0.1975, "step": 274 }, { "epoch": 1.7607042817126852, "grad_norm": 2.758150815701359, "learning_rate": 8.493974353268019e-08, "loss": 0.1922, "step": 275 }, { "epoch": 1.7671068427370948, "grad_norm": 2.9185225875350858, "learning_rate": 8.047222744854942e-08, "loss": 0.1825, "step": 276 }, { "epoch": 1.7735094037615045, "grad_norm": 3.110148299633103, "learning_rate": 7.612046748871326e-08, "loss": 0.2096, "step": 277 }, { "epoch": 1.7799119647859145, "grad_norm": 3.1130566100683246, "learning_rate": 7.188501148096116e-08, "loss": 0.1733, "step": 278 }, { "epoch": 1.7863145258103241, "grad_norm": 3.1415180042361444, "learning_rate": 6.77663926119858e-08, "loss": 0.2043, "step": 279 }, { "epoch": 1.7927170868347337, "grad_norm": 3.3828521625639554, "learning_rate": 6.376512936026279e-08, "loss": 0.2248, "step": 280 }, { "epoch": 1.7991196478591438, "grad_norm": 3.0831187019400916, "learning_rate": 5.988172543078096e-08, "loss": 0.2238, "step": 281 }, { "epoch": 1.8055222088835534, "grad_norm": 3.583225680392213, "learning_rate": 5.611666969163242e-08, "loss": 0.2025, "step": 282 }, { "epoch": 1.811924769907963, "grad_norm": 2.7749576641359055, "learning_rate": 5.2470436112471264e-08, "loss": 0.1918, "step": 283 }, { "epoch": 1.818327330932373, "grad_norm": 3.491814950661689, "learning_rate": 4.8943483704846465e-08, "loss": 0.2017, "step": 284 }, { "epoch": 1.8247298919567827, "grad_norm": 3.3544823516715656, "learning_rate": 4.553625646441928e-08, "loss": 0.2182, "step": 285 }, { "epoch": 1.8311324529811923, "grad_norm": 3.03556242043361, "learning_rate": 4.224918331506955e-08, "loss": 0.2172, "step": 286 }, { "epoch": 1.8375350140056024, "grad_norm": 3.1512946254390815, "learning_rate": 3.908267805490051e-08, "loss": 0.2375, "step": 287 }, { "epoch": 1.843937575030012, "grad_norm": 3.1583349292530687, "learning_rate": 3.6037139304146756e-08, "loss": 0.2164, "step": 288 }, { "epoch": 1.8503401360544216, "grad_norm": 2.6885025755750007, "learning_rate": 3.3112950454993625e-08, "loss": 0.2124, "step": 289 }, { "epoch": 1.8567426970788317, "grad_norm": 2.899693820698532, "learning_rate": 3.0310479623313125e-08, "loss": 0.2094, "step": 290 }, { "epoch": 1.8631452581032413, "grad_norm": 2.9718624867551746, "learning_rate": 2.7630079602323443e-08, "loss": 0.1863, "step": 291 }, { "epoch": 1.8695478191276511, "grad_norm": 3.1641777461916396, "learning_rate": 2.507208781817638e-08, "loss": 0.253, "step": 292 }, { "epoch": 1.875950380152061, "grad_norm": 2.979215203416192, "learning_rate": 2.263682628748087e-08, "loss": 0.218, "step": 293 }, { "epoch": 1.8823529411764706, "grad_norm": 2.935051364990218, "learning_rate": 2.032460157676452e-08, "loss": 0.1981, "step": 294 }, { "epoch": 1.8887555022008804, "grad_norm": 2.818283326905153, "learning_rate": 1.8135704763881598e-08, "loss": 0.2028, "step": 295 }, { "epoch": 1.8951580632252902, "grad_norm": 3.2868954384023823, "learning_rate": 1.607041140137033e-08, "loss": 0.2051, "step": 296 }, { "epoch": 1.9015606242496998, "grad_norm": 2.948811609921979, "learning_rate": 1.4128981481764113e-08, "loss": 0.2096, "step": 297 }, { "epoch": 1.9079631852741097, "grad_norm": 2.6648158990098314, "learning_rate": 1.231165940486234e-08, "loss": 0.1701, "step": 298 }, { "epoch": 1.9143657462985195, "grad_norm": 2.828105773518058, "learning_rate": 1.0618673946963364e-08, "loss": 0.2012, "step": 299 }, { "epoch": 1.9207683073229291, "grad_norm": 3.20360356272725, "learning_rate": 9.050238232065299e-09, "loss": 0.1798, "step": 300 }, { "epoch": 1.927170868347339, "grad_norm": 3.1211562291161457, "learning_rate": 7.606549705035935e-09, "loss": 0.2063, "step": 301 }, { "epoch": 1.9335734293717488, "grad_norm": 3.514934313160573, "learning_rate": 6.2877901067573955e-09, "loss": 0.1871, "step": 302 }, { "epoch": 1.9399759903961584, "grad_norm": 3.080937668829297, "learning_rate": 5.094125451247655e-09, "loss": 0.2097, "step": 303 }, { "epoch": 1.9463785514205683, "grad_norm": 3.035817857990733, "learning_rate": 4.025706004760931e-09, "loss": 0.2057, "step": 304 }, { "epoch": 1.952781112444978, "grad_norm": 3.3455920299884734, "learning_rate": 3.082666266872036e-09, "loss": 0.2198, "step": 305 }, { "epoch": 1.9591836734693877, "grad_norm": 3.2826612214440845, "learning_rate": 2.2651249535439177e-09, "loss": 0.2028, "step": 306 }, { "epoch": 1.9655862344937975, "grad_norm": 3.3305034767663546, "learning_rate": 1.5731849821833953e-09, "loss": 0.2134, "step": 307 }, { "epoch": 1.9719887955182074, "grad_norm": 3.5471720967981573, "learning_rate": 1.0069334586854105e-09, "loss": 0.1908, "step": 308 }, { "epoch": 1.978391356542617, "grad_norm": 3.0004004418384542, "learning_rate": 5.664416664666882e-10, "loss": 0.2138, "step": 309 }, { "epoch": 1.9847939175670268, "grad_norm": 2.8720482666550136, "learning_rate": 2.517650574934693e-10, "loss": 0.2277, "step": 310 }, { "epoch": 1.9911964785914367, "grad_norm": 2.7009139828058832, "learning_rate": 6.29432452994294e-11, "loss": 0.215, "step": 311 }, { "epoch": 1.9975990396158463, "grad_norm": 3.2699084579180306, "learning_rate": 0.0, "loss": 0.2042, "step": 312 }, { "epoch": 1.9975990396158463, "step": 312, "total_flos": 212071482654720.0, "train_loss": 0.2767890989780426, "train_runtime": 7611.443, "train_samples_per_second": 10.506, "train_steps_per_second": 0.041 } ], "logging_steps": 1, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 212071482654720.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }