{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.3193361535668373,
      "epoch": 0.051118210862619806,
      "grad_norm": 0.8119011521339417,
      "learning_rate": 0.0,
      "loss": 1.692,
      "mean_token_accuracy": 0.654717817902565,
      "num_tokens": 133947.0,
      "step": 1
    },
    {
      "entropy": 1.3207192197442055,
      "epoch": 0.10223642172523961,
      "grad_norm": 0.8001739382743835,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.6938,
      "mean_token_accuracy": 0.6540814265608788,
      "num_tokens": 267949.0,
      "step": 2
    },
    {
      "entropy": 1.3139144703745842,
      "epoch": 0.15335463258785942,
      "grad_norm": 0.8021382689476013,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.6875,
      "mean_token_accuracy": 0.6542951986193657,
      "num_tokens": 402435.0,
      "step": 3
    },
    {
      "entropy": 1.3238477781414986,
      "epoch": 0.20447284345047922,
      "grad_norm": 0.8046473264694214,
      "learning_rate": 6e-06,
      "loss": 1.6979,
      "mean_token_accuracy": 0.6523041352629662,
      "num_tokens": 536339.0,
      "step": 4
    },
    {
      "entropy": 1.3151762038469315,
      "epoch": 0.25559105431309903,
      "grad_norm": 0.7999162077903748,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.6884,
      "mean_token_accuracy": 0.6541883014142513,
      "num_tokens": 670396.0,
      "step": 5
    },
    {
      "entropy": 1.3171968907117844,
      "epoch": 0.30670926517571884,
      "grad_norm": 0.8070191740989685,
      "learning_rate": 1e-05,
      "loss": 1.6881,
      "mean_token_accuracy": 0.6536356993019581,
      "num_tokens": 804575.0,
      "step": 6
    },
    {
      "entropy": 1.3177252262830734,
      "epoch": 0.35782747603833864,
      "grad_norm": 0.8115559220314026,
      "learning_rate": 1.2e-05,
      "loss": 1.6852,
      "mean_token_accuracy": 0.6539545804262161,
      "num_tokens": 938351.0,
      "step": 7
    },
    {
      "entropy": 1.3142458871006966,
      "epoch": 0.40894568690095845,
      "grad_norm": 0.809145987033844,
      "learning_rate": 1.4e-05,
      "loss": 1.6739,
      "mean_token_accuracy": 0.6544475704431534,
      "num_tokens": 1072655.0,
      "step": 8
    },
    {
      "entropy": 1.3176514655351639,
      "epoch": 0.46006389776357826,
      "grad_norm": 0.8079097270965576,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.6733,
      "mean_token_accuracy": 0.6541223935782909,
      "num_tokens": 1206472.0,
      "step": 9
    },
    {
      "entropy": 1.3175865784287453,
      "epoch": 0.5111821086261981,
      "grad_norm": 0.8009534478187561,
      "learning_rate": 1.8e-05,
      "loss": 1.657,
      "mean_token_accuracy": 0.655005007982254,
      "num_tokens": 1340462.0,
      "step": 10
    },
    {
      "entropy": 1.3152535259723663,
      "epoch": 0.5623003194888179,
      "grad_norm": 0.7782304883003235,
      "learning_rate": 2e-05,
      "loss": 1.6391,
      "mean_token_accuracy": 0.6572432741522789,
      "num_tokens": 1474818.0,
      "step": 11
    },
    {
      "entropy": 1.31825902312994,
      "epoch": 0.6134185303514377,
      "grad_norm": 0.7459490299224854,
      "learning_rate": 1.977777777777778e-05,
      "loss": 1.6261,
      "mean_token_accuracy": 0.6601505167782307,
      "num_tokens": 1608731.0,
      "step": 12
    },
    {
      "entropy": 1.322536252439022,
      "epoch": 0.6645367412140575,
      "grad_norm": 0.7278594970703125,
      "learning_rate": 1.9555555555555557e-05,
      "loss": 1.6056,
      "mean_token_accuracy": 0.6621886678040028,
      "num_tokens": 1742229.0,
      "step": 13
    },
    {
      "entropy": 1.3205925300717354,
      "epoch": 0.7156549520766773,
      "grad_norm": 0.6732656359672546,
      "learning_rate": 1.9333333333333333e-05,
      "loss": 1.587,
      "mean_token_accuracy": 0.6625417172908783,
      "num_tokens": 1875890.0,
      "step": 14
    },
    {
      "entropy": 1.3151337951421738,
      "epoch": 0.7667731629392971,
      "grad_norm": 0.6385083794593811,
      "learning_rate": 1.9111111111111113e-05,
      "loss": 1.5551,
      "mean_token_accuracy": 0.6706915572285652,
      "num_tokens": 2009719.0,
      "step": 15
    },
    {
      "entropy": 1.3125323951244354,
      "epoch": 0.8178913738019169,
      "grad_norm": 0.6251479387283325,
      "learning_rate": 1.888888888888889e-05,
      "loss": 1.5312,
      "mean_token_accuracy": 0.6725872829556465,
      "num_tokens": 2143710.0,
      "step": 16
    },
    {
      "entropy": 1.3094838485121727,
      "epoch": 0.8690095846645367,
      "grad_norm": 0.6229560375213623,
      "learning_rate": 1.866666666666667e-05,
      "loss": 1.5063,
      "mean_token_accuracy": 0.6759799160063267,
      "num_tokens": 2277556.0,
      "step": 17
    },
    {
      "entropy": 1.3096359893679619,
      "epoch": 0.9201277955271565,
      "grad_norm": 0.6263618469238281,
      "learning_rate": 1.8444444444444448e-05,
      "loss": 1.4846,
      "mean_token_accuracy": 0.6817599721252918,
      "num_tokens": 2411448.0,
      "step": 18
    },
    {
      "entropy": 1.2960194796323776,
      "epoch": 0.9712460063897763,
      "grad_norm": 0.6250044703483582,
      "learning_rate": 1.8222222222222224e-05,
      "loss": 1.4525,
      "mean_token_accuracy": 0.6899448521435261,
      "num_tokens": 2545738.0,
      "step": 19
    },
    {
      "entropy": 1.2956058846579657,
      "epoch": 1.0,
      "grad_norm": 0.6314756870269775,
      "learning_rate": 1.8e-05,
      "loss": 1.4391,
      "mean_token_accuracy": 0.6940541995896233,
      "num_tokens": 2616926.0,
      "step": 20
    },
    {
      "entropy": 1.2914148643612862,
      "epoch": 1.0511182108626198,
      "grad_norm": 0.631912887096405,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 1.4127,
      "mean_token_accuracy": 0.6994642727077007,
      "num_tokens": 2750943.0,
      "step": 21
    },
    {
      "entropy": 1.2763815149664879,
      "epoch": 1.1022364217252396,
      "grad_norm": 0.6287756562232971,
      "learning_rate": 1.7555555555555556e-05,
      "loss": 1.3806,
      "mean_token_accuracy": 0.7029485926032066,
      "num_tokens": 2885149.0,
      "step": 22
    },
    {
      "entropy": 1.2685835510492325,
      "epoch": 1.1533546325878594,
      "grad_norm": 0.6242936253547668,
      "learning_rate": 1.7333333333333336e-05,
      "loss": 1.3606,
      "mean_token_accuracy": 0.7043894305825233,
      "num_tokens": 3019433.0,
      "step": 23
    },
    {
      "entropy": 1.263872005045414,
      "epoch": 1.2044728434504792,
      "grad_norm": 0.6192987561225891,
      "learning_rate": 1.7111111111111112e-05,
      "loss": 1.337,
      "mean_token_accuracy": 0.709864255040884,
      "num_tokens": 3153309.0,
      "step": 24
    },
    {
      "entropy": 1.2505493015050888,
      "epoch": 1.255591054313099,
      "grad_norm": 0.6113152503967285,
      "learning_rate": 1.688888888888889e-05,
      "loss": 1.3113,
      "mean_token_accuracy": 0.7119522020220757,
      "num_tokens": 3287322.0,
      "step": 25
    },
    {
      "entropy": 1.242555320262909,
      "epoch": 1.3067092651757188,
      "grad_norm": 0.597195029258728,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.2892,
      "mean_token_accuracy": 0.7140648253262043,
      "num_tokens": 3421258.0,
      "step": 26
    },
    {
      "entropy": 1.229688823223114,
      "epoch": 1.3578274760383386,
      "grad_norm": 0.586483359336853,
      "learning_rate": 1.6444444444444444e-05,
      "loss": 1.264,
      "mean_token_accuracy": 0.7178861573338509,
      "num_tokens": 3555933.0,
      "step": 27
    },
    {
      "entropy": 1.2219947651028633,
      "epoch": 1.4089456869009584,
      "grad_norm": 0.5772027373313904,
      "learning_rate": 1.6222222222222223e-05,
      "loss": 1.2416,
      "mean_token_accuracy": 0.7191276662051678,
      "num_tokens": 3689685.0,
      "step": 28
    },
    {
      "entropy": 1.2158958613872528,
      "epoch": 1.4600638977635783,
      "grad_norm": 0.5700508952140808,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.2201,
      "mean_token_accuracy": 0.7224024310708046,
      "num_tokens": 3823781.0,
      "step": 29
    },
    {
      "entropy": 1.2152001112699509,
      "epoch": 1.511182108626198,
      "grad_norm": 0.5722755789756775,
      "learning_rate": 1.577777777777778e-05,
      "loss": 1.2091,
      "mean_token_accuracy": 0.722760371863842,
      "num_tokens": 3956942.0,
      "step": 30
    },
    {
      "entropy": 1.1953495219349861,
      "epoch": 1.5623003194888179,
      "grad_norm": 0.5692858695983887,
      "learning_rate": 1.555555555555556e-05,
      "loss": 1.1714,
      "mean_token_accuracy": 0.7302578240633011,
      "num_tokens": 4091313.0,
      "step": 31
    },
    {
      "entropy": 1.1933885142207146,
      "epoch": 1.6134185303514377,
      "grad_norm": 0.5751745700836182,
      "learning_rate": 1.5333333333333334e-05,
      "loss": 1.1615,
      "mean_token_accuracy": 0.7318685166537762,
      "num_tokens": 4225481.0,
      "step": 32
    },
    {
      "entropy": 1.1871799379587173,
      "epoch": 1.6645367412140575,
      "grad_norm": 0.5843569040298462,
      "learning_rate": 1.5111111111111112e-05,
      "loss": 1.1378,
      "mean_token_accuracy": 0.7380619496107101,
      "num_tokens": 4359158.0,
      "step": 33
    },
    {
      "entropy": 1.1805067732930183,
      "epoch": 1.7156549520766773,
      "grad_norm": 0.5917448997497559,
      "learning_rate": 1.488888888888889e-05,
      "loss": 1.1218,
      "mean_token_accuracy": 0.7423498816788197,
      "num_tokens": 4493077.0,
      "step": 34
    },
    {
      "entropy": 1.170977495610714,
      "epoch": 1.766773162939297,
      "grad_norm": 0.59839928150177,
      "learning_rate": 1.4666666666666666e-05,
      "loss": 1.096,
      "mean_token_accuracy": 0.7493030689656734,
      "num_tokens": 4627251.0,
      "step": 35
    },
    {
      "entropy": 1.1678270995616913,
      "epoch": 1.817891373801917,
      "grad_norm": 0.6055343151092529,
      "learning_rate": 1.4444444444444446e-05,
      "loss": 1.0802,
      "mean_token_accuracy": 0.7507753595709801,
      "num_tokens": 4761020.0,
      "step": 36
    },
    {
      "entropy": 1.158110834658146,
      "epoch": 1.8690095846645367,
      "grad_norm": 0.6079038381576538,
      "learning_rate": 1.4222222222222224e-05,
      "loss": 1.0628,
      "mean_token_accuracy": 0.755638737231493,
      "num_tokens": 4895183.0,
      "step": 37
    },
    {
      "entropy": 1.154124453663826,
      "epoch": 1.9201277955271565,
      "grad_norm": 0.6134995222091675,
      "learning_rate": 1.4e-05,
      "loss": 1.043,
      "mean_token_accuracy": 0.7614487372338772,
      "num_tokens": 5028509.0,
      "step": 38
    },
    {
      "entropy": 1.1415963619947433,
      "epoch": 1.9712460063897763,
      "grad_norm": 0.608974277973175,
      "learning_rate": 1.377777777777778e-05,
      "loss": 1.0203,
      "mean_token_accuracy": 0.7697952277958393,
      "num_tokens": 5162590.0,
      "step": 39
    },
    {
      "entropy": 1.1324340105056763,
      "epoch": 2.0,
      "grad_norm": 0.6109018325805664,
      "learning_rate": 1.3555555555555557e-05,
      "loss": 0.9992,
      "mean_token_accuracy": 0.7730923626157973,
      "num_tokens": 5233852.0,
      "step": 40
    },
    {
      "entropy": 1.1270885691046715,
      "epoch": 2.0511182108626196,
      "grad_norm": 0.6178512573242188,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.982,
      "mean_token_accuracy": 0.7760093286633492,
      "num_tokens": 5367686.0,
      "step": 41
    },
    {
      "entropy": 1.1163304820656776,
      "epoch": 2.1022364217252396,
      "grad_norm": 0.6236265301704407,
      "learning_rate": 1.3111111111111113e-05,
      "loss": 0.9606,
      "mean_token_accuracy": 0.7832612432539463,
      "num_tokens": 5501101.0,
      "step": 42
    },
    {
      "entropy": 1.1050259843468666,
      "epoch": 2.1533546325878596,
      "grad_norm": 0.6330907940864563,
      "learning_rate": 1.288888888888889e-05,
      "loss": 0.9442,
      "mean_token_accuracy": 0.7889900915324688,
      "num_tokens": 5634943.0,
      "step": 43
    },
    {
      "entropy": 1.0991561263799667,
      "epoch": 2.2044728434504792,
      "grad_norm": 0.6459551453590393,
      "learning_rate": 1.2666666666666667e-05,
      "loss": 0.9323,
      "mean_token_accuracy": 0.7888863421976566,
      "num_tokens": 5768035.0,
      "step": 44
    },
    {
      "entropy": 1.080874651670456,
      "epoch": 2.255591054313099,
      "grad_norm": 0.6497413516044617,
      "learning_rate": 1.2444444444444446e-05,
      "loss": 0.9089,
      "mean_token_accuracy": 0.7905767410993576,
      "num_tokens": 5901673.0,
      "step": 45
    },
    {
      "entropy": 1.0577788427472115,
      "epoch": 2.306709265175719,
      "grad_norm": 0.6514442563056946,
      "learning_rate": 1.2222222222222224e-05,
      "loss": 0.8845,
      "mean_token_accuracy": 0.7969931028783321,
      "num_tokens": 6036094.0,
      "step": 46
    },
    {
      "entropy": 1.0467759743332863,
      "epoch": 2.357827476038339,
      "grad_norm": 0.6560313105583191,
      "learning_rate": 1.2e-05,
      "loss": 0.8659,
      "mean_token_accuracy": 0.802506472915411,
      "num_tokens": 6170134.0,
      "step": 47
    },
    {
      "entropy": 1.0260686576366425,
      "epoch": 2.4089456869009584,
      "grad_norm": 0.6551167368888855,
      "learning_rate": 1.177777777777778e-05,
      "loss": 0.8486,
      "mean_token_accuracy": 0.8058239929378033,
      "num_tokens": 6304397.0,
      "step": 48
    },
    {
      "entropy": 1.0083096772432327,
      "epoch": 2.460063897763578,
      "grad_norm": 0.6541892290115356,
      "learning_rate": 1.1555555555555556e-05,
      "loss": 0.8356,
      "mean_token_accuracy": 0.8059861660003662,
      "num_tokens": 6438340.0,
      "step": 49
    },
    {
      "entropy": 0.9794742912054062,
      "epoch": 2.511182108626198,
      "grad_norm": 0.6508305668830872,
      "learning_rate": 1.1333333333333334e-05,
      "loss": 0.8151,
      "mean_token_accuracy": 0.8144995309412479,
      "num_tokens": 6573003.0,
      "step": 50
    },
    {
      "entropy": 0.9591087996959686,
      "epoch": 2.562300319488818,
      "grad_norm": 0.6544970273971558,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 0.7982,
      "mean_token_accuracy": 0.821755301207304,
      "num_tokens": 6707094.0,
      "step": 51
    },
    {
      "entropy": 0.9311400800943375,
      "epoch": 2.6134185303514377,
      "grad_norm": 0.661201536655426,
      "learning_rate": 1.088888888888889e-05,
      "loss": 0.7748,
      "mean_token_accuracy": 0.8267807699739933,
      "num_tokens": 6841315.0,
      "step": 52
    },
    {
      "entropy": 0.9006332121789455,
      "epoch": 2.6645367412140573,
      "grad_norm": 0.6626005172729492,
      "learning_rate": 1.0666666666666667e-05,
      "loss": 0.7543,
      "mean_token_accuracy": 0.8362897895276546,
      "num_tokens": 6976249.0,
      "step": 53
    },
    {
      "entropy": 0.8812699876725674,
      "epoch": 2.7156549520766773,
      "grad_norm": 0.674384355545044,
      "learning_rate": 1.0444444444444445e-05,
      "loss": 0.7417,
      "mean_token_accuracy": 0.8424257524311543,
      "num_tokens": 7110449.0,
      "step": 54
    },
    {
      "entropy": 0.8507667072117329,
      "epoch": 2.7667731629392973,
      "grad_norm": 0.695296049118042,
      "learning_rate": 1.0222222222222223e-05,
      "loss": 0.7201,
      "mean_token_accuracy": 0.8484714813530445,
      "num_tokens": 7244307.0,
      "step": 55
    },
    {
      "entropy": 0.8221894763410091,
      "epoch": 2.817891373801917,
      "grad_norm": 0.7484252452850342,
      "learning_rate": 1e-05,
      "loss": 0.7067,
      "mean_token_accuracy": 0.8503611832857132,
      "num_tokens": 7378331.0,
      "step": 56
    },
    {
      "entropy": 0.7954868413507938,
      "epoch": 2.8690095846645365,
      "grad_norm": 0.8117406368255615,
      "learning_rate": 9.777777777777779e-06,
      "loss": 0.6898,
      "mean_token_accuracy": 0.8546305038034916,
      "num_tokens": 7511657.0,
      "step": 57
    },
    {
      "entropy": 0.7787146084010601,
      "epoch": 2.9201277955271565,
      "grad_norm": 0.7788737416267395,
      "learning_rate": 9.555555555555556e-06,
      "loss": 0.6761,
      "mean_token_accuracy": 0.8574383407831192,
      "num_tokens": 7645680.0,
      "step": 58
    },
    {
      "entropy": 0.7654511369764805,
      "epoch": 2.9712460063897765,
      "grad_norm": 0.6763613820075989,
      "learning_rate": 9.333333333333334e-06,
      "loss": 0.6651,
      "mean_token_accuracy": 0.8578773178160191,
      "num_tokens": 7779761.0,
      "step": 59
    },
    {
      "entropy": 0.7472146418359544,
      "epoch": 3.0,
      "grad_norm": 0.6285676956176758,
      "learning_rate": 9.111111111111112e-06,
      "loss": 0.6435,
      "mean_token_accuracy": 0.86120914750629,
      "num_tokens": 7850778.0,
      "step": 60
    },
    {
      "entropy": 0.7355797588825226,
      "epoch": 3.0511182108626196,
      "grad_norm": 0.6584138870239258,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.6312,
      "mean_token_accuracy": 0.8601678982377052,
      "num_tokens": 7984664.0,
      "step": 61
    },
    {
      "entropy": 0.7231738641858101,
      "epoch": 3.1022364217252396,
      "grad_norm": 0.6893587112426758,
      "learning_rate": 8.666666666666668e-06,
      "loss": 0.621,
      "mean_token_accuracy": 0.8597363233566284,
      "num_tokens": 8118372.0,
      "step": 62
    },
    {
      "entropy": 0.6995701305568218,
      "epoch": 3.1533546325878596,
      "grad_norm": 0.6631926894187927,
      "learning_rate": 8.444444444444446e-06,
      "loss": 0.5998,
      "mean_token_accuracy": 0.8639725260436535,
      "num_tokens": 8252383.0,
      "step": 63
    },
    {
      "entropy": 0.6806200109422207,
      "epoch": 3.2044728434504792,
      "grad_norm": 0.6041826009750366,
      "learning_rate": 8.222222222222222e-06,
      "loss": 0.5892,
      "mean_token_accuracy": 0.8684131018817425,
      "num_tokens": 8387058.0,
      "step": 64
    },
    {
      "entropy": 0.6625471338629723,
      "epoch": 3.255591054313099,
      "grad_norm": 0.6061173677444458,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.578,
      "mean_token_accuracy": 0.8719681017100811,
      "num_tokens": 8521167.0,
      "step": 65
    },
    {
      "entropy": 0.6458840593695641,
      "epoch": 3.306709265175719,
      "grad_norm": 0.6620640158653259,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.5631,
      "mean_token_accuracy": 0.8758602887392044,
      "num_tokens": 8654886.0,
      "step": 66
    },
    {
      "entropy": 0.63466951623559,
      "epoch": 3.357827476038339,
      "grad_norm": 0.6536484956741333,
      "learning_rate": 7.555555555555556e-06,
      "loss": 0.5574,
      "mean_token_accuracy": 0.8795712888240814,
      "num_tokens": 8788388.0,
      "step": 67
    },
    {
      "entropy": 0.6206231378018856,
      "epoch": 3.4089456869009584,
      "grad_norm": 0.5983281135559082,
      "learning_rate": 7.333333333333333e-06,
      "loss": 0.5412,
      "mean_token_accuracy": 0.8814935386180878,
      "num_tokens": 8921968.0,
      "step": 68
    },
    {
      "entropy": 0.6042437292635441,
      "epoch": 3.460063897763578,
      "grad_norm": 0.568672776222229,
      "learning_rate": 7.111111111111112e-06,
      "loss": 0.5308,
      "mean_token_accuracy": 0.8831478171050549,
      "num_tokens": 9056066.0,
      "step": 69
    },
    {
      "entropy": 0.601006530225277,
      "epoch": 3.511182108626198,
      "grad_norm": 0.5580371618270874,
      "learning_rate": 6.88888888888889e-06,
      "loss": 0.5254,
      "mean_token_accuracy": 0.8843187876045704,
      "num_tokens": 9189930.0,
      "step": 70
    },
    {
      "entropy": 0.5836833901703358,
      "epoch": 3.562300319488818,
      "grad_norm": 0.541730523109436,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.5103,
      "mean_token_accuracy": 0.8875499293208122,
      "num_tokens": 9323696.0,
      "step": 71
    },
    {
      "entropy": 0.5648492202162743,
      "epoch": 3.6134185303514377,
      "grad_norm": 0.5018172860145569,
      "learning_rate": 6.444444444444445e-06,
      "loss": 0.4952,
      "mean_token_accuracy": 0.8905236721038818,
      "num_tokens": 9457840.0,
      "step": 72
    },
    {
      "entropy": 0.5578960217535496,
      "epoch": 3.6645367412140573,
      "grad_norm": 0.4896445572376251,
      "learning_rate": 6.222222222222223e-06,
      "loss": 0.4918,
      "mean_token_accuracy": 0.8900170363485813,
      "num_tokens": 9591569.0,
      "step": 73
    },
    {
      "entropy": 0.5444952994585037,
      "epoch": 3.7156549520766773,
      "grad_norm": 0.4938449263572693,
      "learning_rate": 6e-06,
      "loss": 0.4859,
      "mean_token_accuracy": 0.8906422667205334,
      "num_tokens": 9725752.0,
      "step": 74
    },
    {
      "entropy": 0.5357677936553955,
      "epoch": 3.7667731629392973,
      "grad_norm": 0.4953802227973938,
      "learning_rate": 5.777777777777778e-06,
      "loss": 0.4771,
      "mean_token_accuracy": 0.8924155794084072,
      "num_tokens": 9859936.0,
      "step": 75
    },
    {
      "entropy": 0.5303498916327953,
      "epoch": 3.817891373801917,
      "grad_norm": 0.47086596488952637,
      "learning_rate": 5.555555555555557e-06,
      "loss": 0.4745,
      "mean_token_accuracy": 0.8935006484389305,
      "num_tokens": 9993839.0,
      "step": 76
    },
    {
      "entropy": 0.5241257101297379,
      "epoch": 3.8690095846645365,
      "grad_norm": 0.46224120259284973,
      "learning_rate": 5.333333333333334e-06,
      "loss": 0.4695,
      "mean_token_accuracy": 0.8958121947944164,
      "num_tokens": 10127352.0,
      "step": 77
    },
    {
      "entropy": 0.5198668912053108,
      "epoch": 3.9201277955271565,
      "grad_norm": 0.4502220153808594,
      "learning_rate": 5.1111111111111115e-06,
      "loss": 0.458,
      "mean_token_accuracy": 0.8979315273463726,
      "num_tokens": 10261669.0,
      "step": 78
    },
    {
      "entropy": 0.5175420753657818,
      "epoch": 3.9712460063897765,
      "grad_norm": 0.460803359746933,
      "learning_rate": 4.888888888888889e-06,
      "loss": 0.4607,
      "mean_token_accuracy": 0.8980869241058826,
      "num_tokens": 10396077.0,
      "step": 79
    },
    {
      "entropy": 0.5124501652187772,
      "epoch": 4.0,
      "grad_norm": 0.4524611532688141,
      "learning_rate": 4.666666666666667e-06,
      "loss": 0.4524,
      "mean_token_accuracy": 0.8987092839346992,
      "num_tokens": 10467704.0,
      "step": 80
    },
    {
      "entropy": 0.5044496711343527,
      "epoch": 4.05111821086262,
      "grad_norm": 0.4456121623516083,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.4517,
      "mean_token_accuracy": 0.8999496810138226,
      "num_tokens": 10601711.0,
      "step": 81
    },
    {
      "entropy": 0.49921168200671673,
      "epoch": 4.102236421725239,
      "grad_norm": 0.43337103724479675,
      "learning_rate": 4.222222222222223e-06,
      "loss": 0.4406,
      "mean_token_accuracy": 0.9021755084395409,
      "num_tokens": 10735922.0,
      "step": 82
    },
    {
      "entropy": 0.49412832595407963,
      "epoch": 4.15335463258786,
      "grad_norm": 0.41774383187294006,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.4383,
      "mean_token_accuracy": 0.9036052823066711,
      "num_tokens": 10869453.0,
      "step": 83
    },
    {
      "entropy": 0.4927068557590246,
      "epoch": 4.204472843450479,
      "grad_norm": 0.4047223925590515,
      "learning_rate": 3.777777777777778e-06,
      "loss": 0.4408,
      "mean_token_accuracy": 0.903934184461832,
      "num_tokens": 11003205.0,
      "step": 84
    },
    {
      "entropy": 0.4868635721504688,
      "epoch": 4.255591054313099,
      "grad_norm": 0.4004896581172943,
      "learning_rate": 3.555555555555556e-06,
      "loss": 0.4333,
      "mean_token_accuracy": 0.9073567539453506,
      "num_tokens": 11136940.0,
      "step": 85
    },
    {
      "entropy": 0.48354302905499935,
      "epoch": 4.306709265175719,
      "grad_norm": 0.39671915769577026,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.4337,
      "mean_token_accuracy": 0.9065112210810184,
      "num_tokens": 11271296.0,
      "step": 86
    },
    {
      "entropy": 0.47907497733831406,
      "epoch": 4.357827476038339,
      "grad_norm": 0.389957070350647,
      "learning_rate": 3.1111111111111116e-06,
      "loss": 0.4262,
      "mean_token_accuracy": 0.9076332710683346,
      "num_tokens": 11405229.0,
      "step": 87
    },
    {
      "entropy": 0.47460952028632164,
      "epoch": 4.4089456869009584,
      "grad_norm": 0.3816507160663605,
      "learning_rate": 2.888888888888889e-06,
      "loss": 0.4241,
      "mean_token_accuracy": 0.9076499193906784,
      "num_tokens": 11538720.0,
      "step": 88
    },
    {
      "entropy": 0.475073354318738,
      "epoch": 4.460063897763578,
      "grad_norm": 0.3826192319393158,
      "learning_rate": 2.666666666666667e-06,
      "loss": 0.4279,
      "mean_token_accuracy": 0.9066745862364769,
      "num_tokens": 11672728.0,
      "step": 89
    },
    {
      "entropy": 0.46915419213473797,
      "epoch": 4.511182108626198,
      "grad_norm": 0.3762473464012146,
      "learning_rate": 2.4444444444444447e-06,
      "loss": 0.4196,
      "mean_token_accuracy": 0.9083298407495022,
      "num_tokens": 11806722.0,
      "step": 90
    },
    {
      "entropy": 0.47265371307730675,
      "epoch": 4.562300319488818,
      "grad_norm": 0.3803304135799408,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.4235,
      "mean_token_accuracy": 0.9058551266789436,
      "num_tokens": 11940711.0,
      "step": 91
    },
    {
      "entropy": 0.4641486182808876,
      "epoch": 4.613418530351438,
      "grad_norm": 0.39110618829727173,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.4151,
      "mean_token_accuracy": 0.9083586372435093,
      "num_tokens": 12074223.0,
      "step": 92
    },
    {
      "entropy": 0.46577116660773754,
      "epoch": 4.664536741214057,
      "grad_norm": 0.37866729497909546,
      "learning_rate": 1.777777777777778e-06,
      "loss": 0.4182,
      "mean_token_accuracy": 0.9072682671248913,
      "num_tokens": 12208745.0,
      "step": 93
    },
    {
      "entropy": 0.46281062439084053,
      "epoch": 4.715654952076678,
      "grad_norm": 0.37949833273887634,
      "learning_rate": 1.5555555555555558e-06,
      "loss": 0.4165,
      "mean_token_accuracy": 0.9079276360571384,
      "num_tokens": 12342771.0,
      "step": 94
    },
    {
      "entropy": 0.46098384633660316,
      "epoch": 4.766773162939297,
      "grad_norm": 0.3821849226951599,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 0.4129,
      "mean_token_accuracy": 0.9086062870919704,
      "num_tokens": 12477150.0,
      "step": 95
    },
    {
      "entropy": 0.4603565651923418,
      "epoch": 4.817891373801917,
      "grad_norm": 0.371977299451828,
      "learning_rate": 1.111111111111111e-06,
      "loss": 0.4169,
      "mean_token_accuracy": 0.906544703990221,
      "num_tokens": 12610985.0,
      "step": 96
    },
    {
      "entropy": 0.45920143835246563,
      "epoch": 4.8690095846645365,
      "grad_norm": 0.3711145222187042,
      "learning_rate": 8.88888888888889e-07,
      "loss": 0.4146,
      "mean_token_accuracy": 0.907651498913765,
      "num_tokens": 12745022.0,
      "step": 97
    },
    {
      "entropy": 0.4641130045056343,
      "epoch": 4.920127795527156,
      "grad_norm": 0.3696272075176239,
      "learning_rate": 6.666666666666667e-07,
      "loss": 0.4185,
      "mean_token_accuracy": 0.9069899655878544,
      "num_tokens": 12879632.0,
      "step": 98
    },
    {
      "entropy": 0.45881704427301884,
      "epoch": 4.9712460063897765,
      "grad_norm": 0.36522284150123596,
      "learning_rate": 4.444444444444445e-07,
      "loss": 0.4125,
      "mean_token_accuracy": 0.9074114449322224,
      "num_tokens": 13013651.0,
      "step": 99
    },
    {
      "entropy": 0.45472526881429887,
      "epoch": 5.0,
      "grad_norm": 0.3692299723625183,
      "learning_rate": 2.2222222222222224e-07,
      "loss": 0.4081,
      "mean_token_accuracy": 0.9088096486197578,
      "num_tokens": 13084630.0,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1414531809633075e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}