| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 638, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.725390625, |
| "epoch": 0.015698587127158554, |
| "grad_norm": 27.6445930424515, |
| "learning_rate": 1.25e-06, |
| "loss": 1.2418, |
| "mean_token_accuracy": 0.7577933847904206, |
| "num_tokens": 177927.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.057421875, |
| "epoch": 0.03139717425431711, |
| "grad_norm": 7.557791169901141, |
| "learning_rate": 2.8125e-06, |
| "loss": 1.0793, |
| "mean_token_accuracy": 0.7529823184013367, |
| "num_tokens": 365825.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.96953125, |
| "epoch": 0.04709576138147567, |
| "grad_norm": 7.799882301858085, |
| "learning_rate": 4.3750000000000005e-06, |
| "loss": 0.9441, |
| "mean_token_accuracy": 0.76476891040802, |
| "num_tokens": 550560.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 0.983203125, |
| "epoch": 0.06279434850863422, |
| "grad_norm": 6.069209273352368, |
| "learning_rate": 5.9375e-06, |
| "loss": 0.9834, |
| "mean_token_accuracy": 0.7617899596691131, |
| "num_tokens": 731067.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.969921875, |
| "epoch": 0.07849293563579278, |
| "grad_norm": 6.613994091501819, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.9232, |
| "mean_token_accuracy": 0.7665803909301758, |
| "num_tokens": 899849.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.88515625, |
| "epoch": 0.09419152276295134, |
| "grad_norm": 5.9986199691905515, |
| "learning_rate": 9.0625e-06, |
| "loss": 0.8711, |
| "mean_token_accuracy": 0.7756525576114655, |
| "num_tokens": 1093174.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.96953125, |
| "epoch": 0.10989010989010989, |
| "grad_norm": 6.593322862351196, |
| "learning_rate": 1.0625e-05, |
| "loss": 0.9321, |
| "mean_token_accuracy": 0.7604309499263764, |
| "num_tokens": 1275951.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.903515625, |
| "epoch": 0.12558869701726844, |
| "grad_norm": 5.878816195096992, |
| "learning_rate": 1.2187500000000001e-05, |
| "loss": 0.9051, |
| "mean_token_accuracy": 0.7668895900249482, |
| "num_tokens": 1453554.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.882421875, |
| "epoch": 0.141287284144427, |
| "grad_norm": 6.045022529791159, |
| "learning_rate": 1.375e-05, |
| "loss": 0.8897, |
| "mean_token_accuracy": 0.7693747282028198, |
| "num_tokens": 1622977.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.96015625, |
| "epoch": 0.15698587127158556, |
| "grad_norm": 5.829657500936277, |
| "learning_rate": 1.5312500000000003e-05, |
| "loss": 0.9579, |
| "mean_token_accuracy": 0.7548311114311218, |
| "num_tokens": 1798487.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.0171875, |
| "epoch": 0.1726844583987441, |
| "grad_norm": 5.908848606028448, |
| "learning_rate": 1.6875e-05, |
| "loss": 1.0203, |
| "mean_token_accuracy": 0.7450043380260467, |
| "num_tokens": 1983138.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.962890625, |
| "epoch": 0.18838304552590268, |
| "grad_norm": 10.060274999825255, |
| "learning_rate": 1.84375e-05, |
| "loss": 0.97, |
| "mean_token_accuracy": 0.7483593642711639, |
| "num_tokens": 2170951.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.046875, |
| "epoch": 0.20408163265306123, |
| "grad_norm": 89.6713302535496, |
| "learning_rate": 2e-05, |
| "loss": 1.034, |
| "mean_token_accuracy": 0.7365632832050324, |
| "num_tokens": 2341116.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 1.0328125, |
| "epoch": 0.21978021978021978, |
| "grad_norm": 6.1550408648286625, |
| "learning_rate": 1.999625580145365e-05, |
| "loss": 1.0563, |
| "mean_token_accuracy": 0.7371429681777955, |
| "num_tokens": 2520854.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.06796875, |
| "epoch": 0.23547880690737832, |
| "grad_norm": 11.503641001521224, |
| "learning_rate": 1.998502600961916e-05, |
| "loss": 1.0691, |
| "mean_token_accuracy": 0.7334820926189423, |
| "num_tokens": 2699173.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 1.047265625, |
| "epoch": 0.25117739403453687, |
| "grad_norm": 13.475655867100802, |
| "learning_rate": 1.9966319033810575e-05, |
| "loss": 1.056, |
| "mean_token_accuracy": 0.7318085134029388, |
| "num_tokens": 2874346.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.021875, |
| "epoch": 0.2668759811616955, |
| "grad_norm": 14.212065936594165, |
| "learning_rate": 1.9940148882554223e-05, |
| "loss": 1.0187, |
| "mean_token_accuracy": 0.7380395472049713, |
| "num_tokens": 3066471.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.9859375, |
| "epoch": 0.282574568288854, |
| "grad_norm": 9.014318953008775, |
| "learning_rate": 1.9906535153098558e-05, |
| "loss": 1.0016, |
| "mean_token_accuracy": 0.7432572603225708, |
| "num_tokens": 3247301.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.029296875, |
| "epoch": 0.29827315541601257, |
| "grad_norm": 5.81288792016315, |
| "learning_rate": 1.9865503016738983e-05, |
| "loss": 1.0401, |
| "mean_token_accuracy": 0.7338248550891876, |
| "num_tokens": 3442754.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.941015625, |
| "epoch": 0.3139717425431711, |
| "grad_norm": 6.044046817780057, |
| "learning_rate": 1.9817083199968552e-05, |
| "loss": 0.9458, |
| "mean_token_accuracy": 0.7531112670898438, |
| "num_tokens": 3631756.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.04609375, |
| "epoch": 0.32967032967032966, |
| "grad_norm": 6.0210489321664635, |
| "learning_rate": 1.9761311961468782e-05, |
| "loss": 1.0348, |
| "mean_token_accuracy": 0.7363064765930176, |
| "num_tokens": 3804931.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 1.10390625, |
| "epoch": 0.3453689167974882, |
| "grad_norm": 5.33683972080787, |
| "learning_rate": 1.9698231064957695e-05, |
| "loss": 1.0934, |
| "mean_token_accuracy": 0.7270130455493927, |
| "num_tokens": 3985420.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.03046875, |
| "epoch": 0.36106750392464676, |
| "grad_norm": 10.704522004356988, |
| "learning_rate": 1.9627887747915496e-05, |
| "loss": 1.0708, |
| "mean_token_accuracy": 0.7329379975795746, |
| "num_tokens": 4169490.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 1.00390625, |
| "epoch": 0.37676609105180536, |
| "grad_norm": 7.036082714775898, |
| "learning_rate": 1.955033468621126e-05, |
| "loss": 1.0103, |
| "mean_token_accuracy": 0.7396618247032165, |
| "num_tokens": 4347650.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.99921875, |
| "epoch": 0.3924646781789639, |
| "grad_norm": 6.818913605192643, |
| "learning_rate": 1.9465629954657185e-05, |
| "loss": 1.0001, |
| "mean_token_accuracy": 0.7393137633800506, |
| "num_tokens": 4537162.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 1.0328125, |
| "epoch": 0.40816326530612246, |
| "grad_norm": 5.755556717131576, |
| "learning_rate": 1.9373836983519807e-05, |
| "loss": 1.0505, |
| "mean_token_accuracy": 0.7324462294578552, |
| "num_tokens": 4712286.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.044140625, |
| "epoch": 0.423861852433281, |
| "grad_norm": 9.446755565244738, |
| "learning_rate": 1.927502451102095e-05, |
| "loss": 1.0619, |
| "mean_token_accuracy": 0.7303735911846161, |
| "num_tokens": 4890250.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 1.11796875, |
| "epoch": 0.43956043956043955, |
| "grad_norm": 10.157882077053817, |
| "learning_rate": 1.916926653186379e-05, |
| "loss": 1.0962, |
| "mean_token_accuracy": 0.7196236491203308, |
| "num_tokens": 5065862.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.9890625, |
| "epoch": 0.4552590266875981, |
| "grad_norm": 4.921009913523743, |
| "learning_rate": 1.905664224182269e-05, |
| "loss": 0.9884, |
| "mean_token_accuracy": 0.7450262784957886, |
| "num_tokens": 5240079.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 1.028125, |
| "epoch": 0.47095761381475665, |
| "grad_norm": 6.1485435101760295, |
| "learning_rate": 1.8937235978438272e-05, |
| "loss": 1.05, |
| "mean_token_accuracy": 0.7296487390995026, |
| "num_tokens": 5415445.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.9984375, |
| "epoch": 0.48665620094191525, |
| "grad_norm": 4.858301358373238, |
| "learning_rate": 1.8811137157862084e-05, |
| "loss": 0.9776, |
| "mean_token_accuracy": 0.7445457696914672, |
| "num_tokens": 5592814.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 1.03203125, |
| "epoch": 0.5023547880690737, |
| "grad_norm": 5.7540662569760395, |
| "learning_rate": 1.8678440207898264e-05, |
| "loss": 1.0429, |
| "mean_token_accuracy": 0.7321981191635132, |
| "num_tokens": 5780889.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.049609375, |
| "epoch": 0.5180533751962323, |
| "grad_norm": 5.198684112846989, |
| "learning_rate": 1.8539244497292248e-05, |
| "loss": 1.04, |
| "mean_token_accuracy": 0.7295248687267304, |
| "num_tokens": 5965218.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 0.9890625, |
| "epoch": 0.533751962323391, |
| "grad_norm": 5.232818269013678, |
| "learning_rate": 1.8393654261319504e-05, |
| "loss": 0.9886, |
| "mean_token_accuracy": 0.7405453681945801, |
| "num_tokens": 6136493.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.086328125, |
| "epoch": 0.5494505494505495, |
| "grad_norm": 5.545204163745446, |
| "learning_rate": 1.8241778523729997e-05, |
| "loss": 1.0636, |
| "mean_token_accuracy": 0.7313859045505524, |
| "num_tokens": 6325624.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 1.057421875, |
| "epoch": 0.565149136577708, |
| "grad_norm": 6.499024191424293, |
| "learning_rate": 1.8083731015106916e-05, |
| "loss": 1.0735, |
| "mean_token_accuracy": 0.7249915122985839, |
| "num_tokens": 6515331.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.0359375, |
| "epoch": 0.5808477237048666, |
| "grad_norm": 5.040936506275316, |
| "learning_rate": 1.7919630087700672e-05, |
| "loss": 1.0775, |
| "mean_token_accuracy": 0.7272228896617889, |
| "num_tokens": 6696076.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 1.06953125, |
| "epoch": 0.5965463108320251, |
| "grad_norm": 4.848467550570556, |
| "learning_rate": 1.7749598626802028e-05, |
| "loss": 1.0442, |
| "mean_token_accuracy": 0.7348412156105042, |
| "num_tokens": 6878935.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.025, |
| "epoch": 0.6122448979591837, |
| "grad_norm": 5.087731121093467, |
| "learning_rate": 1.7573763958720736e-05, |
| "loss": 1.0188, |
| "mean_token_accuracy": 0.7360713183879852, |
| "num_tokens": 7061314.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 1.015625, |
| "epoch": 0.6279434850863422, |
| "grad_norm": 9.74435890233205, |
| "learning_rate": 1.7392257755438516e-05, |
| "loss": 1.0072, |
| "mean_token_accuracy": 0.738730925321579, |
| "num_tokens": 7242353.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.999609375, |
| "epoch": 0.6436420722135008, |
| "grad_norm": 15.98222584676736, |
| "learning_rate": 1.720521593600787e-05, |
| "loss": 0.9944, |
| "mean_token_accuracy": 0.7411578953266144, |
| "num_tokens": 7425295.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 1.06953125, |
| "epoch": 0.6593406593406593, |
| "grad_norm": 5.835834203383889, |
| "learning_rate": 1.7012778564770484e-05, |
| "loss": 1.0383, |
| "mean_token_accuracy": 0.7306137382984161, |
| "num_tokens": 7584824.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.99375, |
| "epoch": 0.6750392464678179, |
| "grad_norm": 4.8622795054153265, |
| "learning_rate": 1.6815089746471472e-05, |
| "loss": 1.0118, |
| "mean_token_accuracy": 0.742330664396286, |
| "num_tokens": 7768228.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 1.03203125, |
| "epoch": 0.6907378335949764, |
| "grad_norm": 4.668555704536192, |
| "learning_rate": 1.6612297518348072e-05, |
| "loss": 1.0245, |
| "mean_token_accuracy": 0.7361457228660584, |
| "num_tokens": 7949122.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.09453125, |
| "epoch": 0.706436420722135, |
| "grad_norm": 4.6567337129267266, |
| "learning_rate": 1.6404553739273426e-05, |
| "loss": 1.1086, |
| "mean_token_accuracy": 0.7151965975761414, |
| "num_tokens": 8123662.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 1.0109375, |
| "epoch": 0.7221350078492935, |
| "grad_norm": 5.00422795457061, |
| "learning_rate": 1.6192013976038663e-05, |
| "loss": 1.0294, |
| "mean_token_accuracy": 0.7341724216938019, |
| "num_tokens": 8312360.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.028125, |
| "epoch": 0.7378335949764521, |
| "grad_norm": 5.388442787487355, |
| "learning_rate": 1.597483738685829e-05, |
| "loss": 1.0036, |
| "mean_token_accuracy": 0.7365865647792816, |
| "num_tokens": 8492020.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 1.063671875, |
| "epoch": 0.7535321821036107, |
| "grad_norm": 6.334463361903077, |
| "learning_rate": 1.5753186602186207e-05, |
| "loss": 1.0574, |
| "mean_token_accuracy": 0.7286219894886017, |
| "num_tokens": 8665809.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.017578125, |
| "epoch": 0.7692307692307693, |
| "grad_norm": 5.970482979437965, |
| "learning_rate": 1.552722760293157e-05, |
| "loss": 1.0271, |
| "mean_token_accuracy": 0.7386983633041382, |
| "num_tokens": 8847168.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 0.987109375, |
| "epoch": 0.7849293563579278, |
| "grad_norm": 5.1621550529599585, |
| "learning_rate": 1.5297129596165684e-05, |
| "loss": 0.9863, |
| "mean_token_accuracy": 0.7422987043857574, |
| "num_tokens": 9025797.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.087109375, |
| "epoch": 0.8006279434850864, |
| "grad_norm": 5.452415452645883, |
| "learning_rate": 1.5063064888413048e-05, |
| "loss": 1.0758, |
| "mean_token_accuracy": 0.7265674948692322, |
| "num_tokens": 9207096.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 1.03203125, |
| "epoch": 0.8163265306122449, |
| "grad_norm": 4.632036015662455, |
| "learning_rate": 1.4825208756621354e-05, |
| "loss": 1.0237, |
| "mean_token_accuracy": 0.7348480224609375, |
| "num_tokens": 9388986.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.013671875, |
| "epoch": 0.8320251177394035, |
| "grad_norm": 5.170243707129603, |
| "learning_rate": 1.4583739316907188e-05, |
| "loss": 1.0082, |
| "mean_token_accuracy": 0.7390485882759095, |
| "num_tokens": 9574926.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 1.047265625, |
| "epoch": 0.847723704866562, |
| "grad_norm": 4.549399122747884, |
| "learning_rate": 1.4338837391175582e-05, |
| "loss": 1.0387, |
| "mean_token_accuracy": 0.7376436531543732, |
| "num_tokens": 9741165.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.97578125, |
| "epoch": 0.8634222919937206, |
| "grad_norm": 4.415042132554178, |
| "learning_rate": 1.4090686371713403e-05, |
| "loss": 0.977, |
| "mean_token_accuracy": 0.7484280169010162, |
| "num_tokens": 9932402.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.996484375, |
| "epoch": 0.8791208791208791, |
| "grad_norm": 4.520054645554902, |
| "learning_rate": 1.3839472083857912e-05, |
| "loss": 0.9802, |
| "mean_token_accuracy": 0.7440161406993866, |
| "num_tokens": 10116131.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.01328125, |
| "epoch": 0.8948194662480377, |
| "grad_norm": 4.353453268472847, |
| "learning_rate": 1.3585382646843396e-05, |
| "loss": 1.0053, |
| "mean_token_accuracy": 0.7423594474792481, |
| "num_tokens": 10301345.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 0.94609375, |
| "epoch": 0.9105180533751962, |
| "grad_norm": 4.478156722785245, |
| "learning_rate": 1.332860833293e-05, |
| "loss": 0.9375, |
| "mean_token_accuracy": 0.752207487821579, |
| "num_tokens": 10487099.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.991796875, |
| "epoch": 0.9262166405023547, |
| "grad_norm": 5.467454719870752, |
| "learning_rate": 1.3069341424920301e-05, |
| "loss": 0.9833, |
| "mean_token_accuracy": 0.7454055905342102, |
| "num_tokens": 10668291.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 0.991796875, |
| "epoch": 0.9419152276295133, |
| "grad_norm": 5.779761643334634, |
| "learning_rate": 1.280777607217031e-05, |
| "loss": 1.0012, |
| "mean_token_accuracy": 0.7384802579879761, |
| "num_tokens": 10838776.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.019140625, |
| "epoch": 0.957613814756672, |
| "grad_norm": 6.047537199685854, |
| "learning_rate": 1.2544108145202748e-05, |
| "loss": 1.0125, |
| "mean_token_accuracy": 0.7406556665897369, |
| "num_tokens": 11015732.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 0.985546875, |
| "epoch": 0.9733124018838305, |
| "grad_norm": 4.609576682854215, |
| "learning_rate": 1.2278535089031377e-05, |
| "loss": 0.9779, |
| "mean_token_accuracy": 0.7456105411052704, |
| "num_tokens": 11215029.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.979296875, |
| "epoch": 0.989010989010989, |
| "grad_norm": 4.594430034893244, |
| "learning_rate": 1.2011255775306378e-05, |
| "loss": 0.9851, |
| "mean_token_accuracy": 0.7425277471542359, |
| "num_tokens": 11397796.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 0.9288194444444444, |
| "epoch": 1.0031397174254317, |
| "grad_norm": 5.271756092829798, |
| "learning_rate": 1.1742470353391329e-05, |
| "loss": 0.8636, |
| "mean_token_accuracy": 0.7742305133077834, |
| "num_tokens": 11566189.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.527734375, |
| "epoch": 1.0188383045525902, |
| "grad_norm": 4.26050787407313, |
| "learning_rate": 1.1472380100483438e-05, |
| "loss": 0.5265, |
| "mean_token_accuracy": 0.8555980503559113, |
| "num_tokens": 11763172.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.6, |
| "epoch": 1.0345368916797488, |
| "grad_norm": 4.175751994504683, |
| "learning_rate": 1.1201187270889166e-05, |
| "loss": 0.5485, |
| "mean_token_accuracy": 0.8497718095779419, |
| "num_tokens": 11945459.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.5111328125, |
| "epoch": 1.0502354788069075, |
| "grad_norm": 4.128005949558796, |
| "learning_rate": 1.0929094944568182e-05, |
| "loss": 0.4993, |
| "mean_token_accuracy": 0.862428092956543, |
| "num_tokens": 12128965.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 0.6125, |
| "epoch": 1.065934065934066, |
| "grad_norm": 3.837079491617288, |
| "learning_rate": 1.0656306875059024e-05, |
| "loss": 0.5625, |
| "mean_token_accuracy": 0.8477550685405731, |
| "num_tokens": 12328856.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.550390625, |
| "epoch": 1.0816326530612246, |
| "grad_norm": 5.12821302444939, |
| "learning_rate": 1.0383027336900356e-05, |
| "loss": 0.5442, |
| "mean_token_accuracy": 0.8556217610836029, |
| "num_tokens": 12504085.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 0.608203125, |
| "epoch": 1.097331240188383, |
| "grad_norm": 4.192313568974114, |
| "learning_rate": 1.0109460972662081e-05, |
| "loss": 0.5784, |
| "mean_token_accuracy": 0.8412099361419678, |
| "num_tokens": 12682650.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.5048828125, |
| "epoch": 1.1130298273155417, |
| "grad_norm": 4.016027796950457, |
| "learning_rate": 9.835812639700862e-06, |
| "loss": 0.4762, |
| "mean_token_accuracy": 0.8649287343025207, |
| "num_tokens": 12870774.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 0.5435546875, |
| "epoch": 1.1287284144427001, |
| "grad_norm": 4.461511861583663, |
| "learning_rate": 9.562287256754791e-06, |
| "loss": 0.5157, |
| "mean_token_accuracy": 0.8568434357643128, |
| "num_tokens": 13058291.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.526953125, |
| "epoch": 1.1444270015698588, |
| "grad_norm": 3.960694420334278, |
| "learning_rate": 9.289089650492119e-06, |
| "loss": 0.4905, |
| "mean_token_accuracy": 0.8625445365905762, |
| "num_tokens": 13239182.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 0.5201171875, |
| "epoch": 1.1601255886970172, |
| "grad_norm": 3.936087366602336, |
| "learning_rate": 9.016424402128891e-06, |
| "loss": 0.5054, |
| "mean_token_accuracy": 0.8619497656822205, |
| "num_tokens": 13421892.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.58828125, |
| "epoch": 1.1758241758241759, |
| "grad_norm": 3.9961031282423227, |
| "learning_rate": 8.744495694230413e-06, |
| "loss": 0.5439, |
| "mean_token_accuracy": 0.8468229651451111, |
| "num_tokens": 13593687.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.4953125, |
| "epoch": 1.1915227629513343, |
| "grad_norm": 4.322236217725683, |
| "learning_rate": 8.473507157811254e-06, |
| "loss": 0.4683, |
| "mean_token_accuracy": 0.8697527587413788, |
| "num_tokens": 13780570.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.529296875, |
| "epoch": 1.207221350078493, |
| "grad_norm": 5.141990553056302, |
| "learning_rate": 8.203661719848249e-06, |
| "loss": 0.4863, |
| "mean_token_accuracy": 0.8653198599815368, |
| "num_tokens": 13968713.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 0.5470703125, |
| "epoch": 1.2229199372056514, |
| "grad_norm": 3.842210785861936, |
| "learning_rate": 7.935161451320696e-06, |
| "loss": 0.5175, |
| "mean_token_accuracy": 0.858714509010315, |
| "num_tokens": 14142058.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.553125, |
| "epoch": 1.23861852433281, |
| "grad_norm": 5.339381588516691, |
| "learning_rate": 7.668207415891625e-06, |
| "loss": 0.534, |
| "mean_token_accuracy": 0.8537694931030273, |
| "num_tokens": 14324073.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 0.539453125, |
| "epoch": 1.2543171114599687, |
| "grad_norm": 4.116642693044476, |
| "learning_rate": 7.402999519343319e-06, |
| "loss": 0.501, |
| "mean_token_accuracy": 0.8621095418930054, |
| "num_tokens": 14493070.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.50625, |
| "epoch": 1.2700156985871272, |
| "grad_norm": 3.8927696021633973, |
| "learning_rate": 7.139736359879916e-06, |
| "loss": 0.4792, |
| "mean_token_accuracy": 0.8644804239273072, |
| "num_tokens": 14667152.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 0.5326171875, |
| "epoch": 1.2857142857142856, |
| "grad_norm": 3.9499973554079313, |
| "learning_rate": 6.878615079409221e-06, |
| "loss": 0.4959, |
| "mean_token_accuracy": 0.8623277962207794, |
| "num_tokens": 14841585.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.52265625, |
| "epoch": 1.3014128728414442, |
| "grad_norm": 4.630623780872735, |
| "learning_rate": 6.619831215914974e-06, |
| "loss": 0.4879, |
| "mean_token_accuracy": 0.8631112694740295, |
| "num_tokens": 15018117.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 0.534765625, |
| "epoch": 1.317111459968603, |
| "grad_norm": 4.099657499983856, |
| "learning_rate": 6.363578557030285e-06, |
| "loss": 0.5156, |
| "mean_token_accuracy": 0.8561935067176819, |
| "num_tokens": 15198528.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.540234375, |
| "epoch": 1.3328100470957613, |
| "grad_norm": 4.639376931843806, |
| "learning_rate": 6.110048994921735e-06, |
| "loss": 0.5195, |
| "mean_token_accuracy": 0.8552192032337189, |
| "num_tokens": 15368254.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 0.564453125, |
| "epoch": 1.34850863422292, |
| "grad_norm": 3.6122566097775257, |
| "learning_rate": 5.859432382592895e-06, |
| "loss": 0.5143, |
| "mean_token_accuracy": 0.8563300728797912, |
| "num_tokens": 15537168.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.5, |
| "epoch": 1.3642072213500784, |
| "grad_norm": 4.097499019050656, |
| "learning_rate": 5.611916391714887e-06, |
| "loss": 0.4718, |
| "mean_token_accuracy": 0.868126118183136, |
| "num_tokens": 15712027.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 0.512890625, |
| "epoch": 1.379905808477237, |
| "grad_norm": 3.8541646824269042, |
| "learning_rate": 5.367686372090359e-06, |
| "loss": 0.4784, |
| "mean_token_accuracy": 0.8659977853298187, |
| "num_tokens": 15895860.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.5048828125, |
| "epoch": 1.3956043956043955, |
| "grad_norm": 3.8898545956491763, |
| "learning_rate": 5.126925212856202e-06, |
| "loss": 0.4616, |
| "mean_token_accuracy": 0.8711130917072296, |
| "num_tokens": 16076742.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 0.4904296875, |
| "epoch": 1.4113029827315542, |
| "grad_norm": 4.486423301199419, |
| "learning_rate": 4.889813205528895e-06, |
| "loss": 0.4697, |
| "mean_token_accuracy": 0.8675071418285369, |
| "num_tokens": 16253720.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.522265625, |
| "epoch": 1.4270015698587128, |
| "grad_norm": 4.353150769090539, |
| "learning_rate": 4.65652790899508e-06, |
| "loss": 0.4806, |
| "mean_token_accuracy": 0.8667589604854584, |
| "num_tokens": 16430030.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 0.508203125, |
| "epoch": 1.4427001569858713, |
| "grad_norm": 3.724503033243514, |
| "learning_rate": 4.427244016548375e-06, |
| "loss": 0.4856, |
| "mean_token_accuracy": 0.8651303589344025, |
| "num_tokens": 16620012.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.5498046875, |
| "epoch": 1.4583987441130297, |
| "grad_norm": 3.8968644379129165, |
| "learning_rate": 4.202133225072153e-06, |
| "loss": 0.5123, |
| "mean_token_accuracy": 0.8566905677318573, |
| "num_tokens": 16799772.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 0.5142578125, |
| "epoch": 1.4740973312401884, |
| "grad_norm": 4.227950531214132, |
| "learning_rate": 3.9813641064660525e-06, |
| "loss": 0.4776, |
| "mean_token_accuracy": 0.8667101263999939, |
| "num_tokens": 16980171.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.4716796875, |
| "epoch": 1.489795918367347, |
| "grad_norm": 3.7526226508520275, |
| "learning_rate": 3.7651019814126656e-06, |
| "loss": 0.4512, |
| "mean_token_accuracy": 0.8713797450065612, |
| "num_tokens": 17159904.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 0.481640625, |
| "epoch": 1.5054945054945055, |
| "grad_norm": 3.6868580303818583, |
| "learning_rate": 3.5535087955788396e-06, |
| "loss": 0.4425, |
| "mean_token_accuracy": 0.8738309502601623, |
| "num_tokens": 17337452.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.5099609375, |
| "epoch": 1.521193092621664, |
| "grad_norm": 3.695453877176934, |
| "learning_rate": 3.3467429983443477e-06, |
| "loss": 0.4807, |
| "mean_token_accuracy": 0.8654368996620179, |
| "num_tokens": 17524548.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 0.4744140625, |
| "epoch": 1.5368916797488226, |
| "grad_norm": 3.54819349493318, |
| "learning_rate": 3.144959424148666e-06, |
| "loss": 0.4421, |
| "mean_token_accuracy": 0.8751842856407166, |
| "num_tokens": 17722299.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.488671875, |
| "epoch": 1.5525902668759812, |
| "grad_norm": 3.8797108965135285, |
| "learning_rate": 2.9483091765448426e-06, |
| "loss": 0.453, |
| "mean_token_accuracy": 0.8750694751739502, |
| "num_tokens": 17902084.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 0.481640625, |
| "epoch": 1.5682888540031397, |
| "grad_norm": 3.992381863433219, |
| "learning_rate": 2.756939515047108e-06, |
| "loss": 0.4511, |
| "mean_token_accuracy": 0.8725055634975434, |
| "num_tokens": 18079745.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.503125, |
| "epoch": 1.5839874411302983, |
| "grad_norm": 3.6288480240477794, |
| "learning_rate": 2.570993744857151e-06, |
| "loss": 0.4688, |
| "mean_token_accuracy": 0.8687343716621398, |
| "num_tokens": 18258975.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 0.49375, |
| "epoch": 1.599686028257457, |
| "grad_norm": 3.9138568832414684, |
| "learning_rate": 2.390611109551456e-06, |
| "loss": 0.4576, |
| "mean_token_accuracy": 0.8736713230609894, |
| "num_tokens": 18436950.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.4779296875, |
| "epoch": 1.6153846153846154, |
| "grad_norm": 3.974668183681603, |
| "learning_rate": 2.215926686810206e-06, |
| "loss": 0.4624, |
| "mean_token_accuracy": 0.8699750483036042, |
| "num_tokens": 18612816.0, |
| "step": 515 |
| }, |
| { |
| "entropy": 0.46953125, |
| "epoch": 1.6310832025117739, |
| "grad_norm": 3.417678338297766, |
| "learning_rate": 2.047071287265735e-06, |
| "loss": 0.4274, |
| "mean_token_accuracy": 0.8788953125476837, |
| "num_tokens": 18779655.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.512109375, |
| "epoch": 1.6467817896389325, |
| "grad_norm": 4.3168528799542365, |
| "learning_rate": 1.8841713565463548e-06, |
| "loss": 0.4806, |
| "mean_token_accuracy": 0.8664675652980804, |
| "num_tokens": 18963728.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 0.521484375, |
| "epoch": 1.6624803767660912, |
| "grad_norm": 3.8378909961117746, |
| "learning_rate": 1.727348880588815e-06, |
| "loss": 0.4898, |
| "mean_token_accuracy": 0.864307701587677, |
| "num_tokens": 19155989.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.5279296875, |
| "epoch": 1.6781789638932496, |
| "grad_norm": 3.7445661009980395, |
| "learning_rate": 1.5767212942904275e-06, |
| "loss": 0.4797, |
| "mean_token_accuracy": 0.8675626039505004, |
| "num_tokens": 19331660.0, |
| "step": 535 |
| }, |
| { |
| "entropy": 0.49140625, |
| "epoch": 1.693877551020408, |
| "grad_norm": 3.57312597751139, |
| "learning_rate": 1.4324013935691205e-06, |
| "loss": 0.4652, |
| "mean_token_accuracy": 0.8723213970661163, |
| "num_tokens": 19506633.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.4623046875, |
| "epoch": 1.7095761381475667, |
| "grad_norm": 3.8004849150443754, |
| "learning_rate": 1.2944972508973908e-06, |
| "loss": 0.4387, |
| "mean_token_accuracy": 0.8761039733886719, |
| "num_tokens": 19687893.0, |
| "step": 545 |
| }, |
| { |
| "entropy": 0.4958984375, |
| "epoch": 1.7252747252747254, |
| "grad_norm": 3.7246541942244846, |
| "learning_rate": 1.1631121343733443e-06, |
| "loss": 0.457, |
| "mean_token_accuracy": 0.8720730483531952, |
| "num_tokens": 19861818.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.5181640625, |
| "epoch": 1.7409733124018838, |
| "grad_norm": 3.771046430655077, |
| "learning_rate": 1.0383444303894453e-06, |
| "loss": 0.4931, |
| "mean_token_accuracy": 0.8654853940010071, |
| "num_tokens": 20051183.0, |
| "step": 555 |
| }, |
| { |
| "entropy": 0.4501953125, |
| "epoch": 1.7566718995290422, |
| "grad_norm": 3.136760621936483, |
| "learning_rate": 9.202875699568636e-07, |
| "loss": 0.4119, |
| "mean_token_accuracy": 0.8824858725070953, |
| "num_tokens": 20240768.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.4869140625, |
| "epoch": 1.772370486656201, |
| "grad_norm": 3.821451067956206, |
| "learning_rate": 8.090299587406514e-07, |
| "loss": 0.4514, |
| "mean_token_accuracy": 0.8730863869190216, |
| "num_tokens": 20418509.0, |
| "step": 565 |
| }, |
| { |
| "entropy": 0.47734375, |
| "epoch": 1.7880690737833596, |
| "grad_norm": 3.8388982673994567, |
| "learning_rate": 7.04654910858038e-07, |
| "loss": 0.4506, |
| "mean_token_accuracy": 0.8735210478305817, |
| "num_tokens": 20598161.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.45625, |
| "epoch": 1.803767660910518, |
| "grad_norm": 3.9183565034692465, |
| "learning_rate": 6.072405864895403e-07, |
| "loss": 0.4307, |
| "mean_token_accuracy": 0.8798979103565217, |
| "num_tokens": 20779725.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 0.4697265625, |
| "epoch": 1.8194662480376766, |
| "grad_norm": 4.022575031872933, |
| "learning_rate": 5.16859933349495e-07, |
| "loss": 0.4312, |
| "mean_token_accuracy": 0.8792614817619324, |
| "num_tokens": 20948735.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.4603515625, |
| "epoch": 1.8351648351648353, |
| "grad_norm": 3.4815300156968076, |
| "learning_rate": 4.335806320599234e-07, |
| "loss": 0.4257, |
| "mean_token_accuracy": 0.8804711699485779, |
| "num_tokens": 21121016.0, |
| "step": 585 |
| }, |
| { |
| "entropy": 0.4689453125, |
| "epoch": 1.8508634222919937, |
| "grad_norm": 3.4871494725317707, |
| "learning_rate": 3.574650454685902e-07, |
| "loss": 0.4385, |
| "mean_token_accuracy": 0.8778711676597595, |
| "num_tokens": 21311277.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.4796875, |
| "epoch": 1.8665620094191522, |
| "grad_norm": 3.882125284807203, |
| "learning_rate": 2.8857017194923174e-07, |
| "loss": 0.4416, |
| "mean_token_accuracy": 0.8744481921195983, |
| "num_tokens": 21491007.0, |
| "step": 595 |
| }, |
| { |
| "entropy": 0.48984375, |
| "epoch": 1.8822605965463108, |
| "grad_norm": 3.5907652067766356, |
| "learning_rate": 2.2694760271890215e-07, |
| "loss": 0.46, |
| "mean_token_accuracy": 0.8710661828517914, |
| "num_tokens": 21670814.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.4677734375, |
| "epoch": 1.8979591836734695, |
| "grad_norm": 3.5415128539129705, |
| "learning_rate": 1.7264348320442992e-07, |
| "loss": 0.4244, |
| "mean_token_accuracy": 0.8819806277751923, |
| "num_tokens": 21855006.0, |
| "step": 605 |
| }, |
| { |
| "entropy": 0.5015625, |
| "epoch": 1.913657770800628, |
| "grad_norm": 4.003861542545578, |
| "learning_rate": 1.256984784868842e-07, |
| "loss": 0.4683, |
| "mean_token_accuracy": 0.8715729892253876, |
| "num_tokens": 22033513.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.470703125, |
| "epoch": 1.9293563579277864, |
| "grad_norm": 3.119159854744513, |
| "learning_rate": 8.614774284994797e-08, |
| "loss": 0.443, |
| "mean_token_accuracy": 0.8770529448986053, |
| "num_tokens": 22220738.0, |
| "step": 615 |
| }, |
| { |
| "entropy": 0.455078125, |
| "epoch": 1.945054945054945, |
| "grad_norm": 3.873386527152864, |
| "learning_rate": 5.402089345499795e-08, |
| "loss": 0.4261, |
| "mean_token_accuracy": 0.8815056264400483, |
| "num_tokens": 22411420.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.448828125, |
| "epoch": 1.9607535321821037, |
| "grad_norm": 3.503514019607563, |
| "learning_rate": 2.9341988162595593e-08, |
| "loss": 0.4299, |
| "mean_token_accuracy": 0.8768543183803559, |
| "num_tokens": 22594533.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 0.49296875, |
| "epoch": 1.9764521193092621, |
| "grad_norm": 3.6439296442719877, |
| "learning_rate": 1.2129507517003591e-08, |
| "loss": 0.4646, |
| "mean_token_accuracy": 0.8703228771686554, |
| "num_tokens": 22774333.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.4900390625, |
| "epoch": 1.9921507064364206, |
| "grad_norm": 3.4854702032469973, |
| "learning_rate": 2.396340907225847e-09, |
| "loss": 0.46, |
| "mean_token_accuracy": 0.8727392196655274, |
| "num_tokens": 22959536.0, |
| "step": 635 |
| }, |
| { |
| "entropy": 0.5125, |
| "epoch": 2.0, |
| "mean_token_accuracy": 0.8682946562767029, |
| "num_tokens": 23058558.0, |
| "step": 638, |
| "total_flos": 77930701455360.0, |
| "train_loss": 0.7463928270489445, |
| "train_runtime": 1636.8404, |
| "train_samples_per_second": 24.899, |
| "train_steps_per_second": 0.39 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 638, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 77930701455360.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|